From 13181fd975bccb25d1d1126347872ef68279d567 Mon Sep 17 00:00:00 2001 From: Shijie <505749828@qq.com> Date: Thu, 27 Oct 2022 14:54:41 +0800 Subject: [PATCH 01/91] Add launch_bounds (#47285) --- .../operators/fused/fused_dropout_act_bias.h | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h index 6b2cdfb6a8d2f..e3e19d9ea6ebc 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -256,17 +256,19 @@ template -__global__ void FusedDropoutActBiasGrad(Functor act_grad, - const T *dout, - const MaskType *mask, - const T *src, - const T *bias, - const T factor, - const int64_t rows, - const int64_t cols, - T *dx, - T *dbias) { + typename Functor, + int THREADS_PER_CTA = BlockSizeX *BlockSizeY> +__global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad( + Functor act_grad, + const T *dout, + const MaskType *mask, + const T *src, + const T *bias, + const T factor, + const int64_t rows, + const int64_t cols, + T *dx, + T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; using LoadT = phi::AlignedVector; From 8dca988214a2539235251b70d78a79c19f3f1492 Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Thu, 27 Oct 2022 15:38:09 +0800 Subject: [PATCH 02/91] Fix the symbol missing bug about cinn. (#47347) --- paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc | 4 ++-- paddle/fluid/inference/paddle_inference_custom_device.map | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index 77b7bd7e338f1..e97a56a743e25 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -646,8 +646,8 @@ void SearchAllSubgraphs(Graph* graph) { VLOG(4) << "The allowed Cinn Ops: " << FLAGS_allow_cinn_ops; VLOG(4) << "The denied Cinn Ops: " << FLAGS_deny_cinn_ops; std::vector clusters = CinnSubgraphDetector(graph, teller)(); - VLOG(3) << "--- [build_cinn_pass] detected " << clusters.size() - << " cinn supported subgraphs"; + LOG(INFO) << "--- [build_cinn_pass] detected " << clusters.size() + << " cinn supported subgraphs"; auto cluster_debug_info = [](const GraphNodeSet& cluster) { std::string res = "("; diff --git a/paddle/fluid/inference/paddle_inference_custom_device.map b/paddle/fluid/inference/paddle_inference_custom_device.map index d78860e0a2070..391842a4c6160 100644 --- a/paddle/fluid/inference/paddle_inference_custom_device.map +++ b/paddle/fluid/inference/paddle_inference_custom_device.map @@ -6,6 +6,7 @@ *phi*; *FLAGS_*; PD_*; + *cinn*; local: *; }; From daf98c15451f70caae5626baa43d006348bc6410 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Thu, 27 Oct 2022 16:27:59 +0800 Subject: [PATCH 03/91] New precise map (#47389) * precise_test_map_optimization * modify get_pr_ut.py * apply new map to coverage,test=coverage * apply new map to coverage,test=coverage * apply new map to coverage,test=coverage * apply new map to coverage,test=coverage * apply new map to coverage,test=coverage * apply new map to coverage,test=coverage --- tools/coverage/paddle_coverage.sh | 3 +++ tools/get_pr_ut.py | 10 ++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/coverage/paddle_coverage.sh b/tools/coverage/paddle_coverage.sh 
index 47a124c3c77cc..1d62608f9f961 100644 --- a/tools/coverage/paddle_coverage.sh +++ b/tools/coverage/paddle_coverage.sh @@ -50,6 +50,8 @@ function gen_full_html_report() { '/paddle/paddle/fluid/string/*' \ '/paddle/paddle/fluid/eager/*' \ '/paddle/paddle/phi/*' \ + '/paddle/paddle/utils/*' \ + -o coverage-full.tmp \ --rc lcov_branch_coverage=0 @@ -63,6 +65,7 @@ function gen_full_html_report() { '/paddle/paddle/fluid/inference/api/demo_ci/*' \ '/paddle/paddle/fluid/eager/tests/*' \ '/paddle/paddle/phi/tests/*' \ + -o coverage-full.tmp \ --rc lcov_branch_coverage=0 diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index ba8dbeb1da572..4dbd32d962636 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -303,7 +303,7 @@ def get_pr_ut(self): file_ut_map = None ret = self.__urlretrieve( - 'https://paddle-docker-tar.bj.bcebos.com/pre_test/ut_file_map.json', + 'https://paddle-docker-tar.bj.bcebos.com/pre_test_tmp/ut_file_map.json', 'ut_file_map.json', ) if not ret: @@ -326,9 +326,7 @@ def get_pr_ut(self): if filename.startswith(PADDLE_ROOT + 'python/'): file_list.append(filename) elif filename.startswith(PADDLE_ROOT + 'paddle/'): - if filename.startswith( - (PADDLE_ROOT + 'paddle/infrt', PADDLE_ROOT + 'paddle/utils') - ): + if filename.startswith((PADDLE_ROOT + 'paddle/infrt')): filterFiles.append(filename) elif filename.startswith(PADDLE_ROOT + 'paddle/scripts'): if filename.startswith( @@ -354,7 +352,7 @@ def get_pr_ut(self): if len(file_list) == 0: ut_list.append('filterfiles_placeholder') ret = self.__urlretrieve( - 'https://paddle-docker-tar.bj.bcebos.com/pre_test/prec_delta', + 'https://paddle-docker-tar.bj.bcebos.com/pre_test_tmp/prec_delta', 'prec_delta', ) if ret: @@ -460,7 +458,7 @@ def get_pr_ut(self): else: if ut_list: ret = self.__urlretrieve( - 'https://paddle-docker-tar.bj.bcebos.com/pre_test/prec_delta', + 'https://paddle-docker-tar.bj.bcebos.com/pre_test_tmp/prec_delta', 'prec_delta', ) if ret: From 23c9c8857bb3a02028160fd012a6516ea2140b08 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Thu, 27 Oct 2022 16:28:21 +0800 Subject: [PATCH 04/91] precise_test_logic_update (#47387) * test,test=ljd_test * modify get_ut_file_map.py,test=ljd_test * modify paddle_build.sh,test=ljd_test * test,test=ljd_test * update precise test logic,test=ljd_test * it is a test,test=ljd_test * it is a test,test=ljd_test * it is a test,test=ljd_test * it is a test,test=ljd_test * it is a test,test=ljd_test * it is a test,test=ljd_test * precise test logic update --- paddle/scripts/paddle_build.sh | 34 ++++++++++++------- tools/final_ut_parallel_rule.py | 10 ------ tools/get_pr_ut.py | 3 +- tools/get_single_test_cov.py | 41 +++++++++++++++++++++-- tools/get_ut_file_map.py | 58 +++++++++++++++++++++++---------- 5 files changed, 103 insertions(+), 43 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 19598e4896a43..7042fa3dec661 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1825,16 +1825,30 @@ function precise_card_test_single { set +x testcases=$1 num=$2 - for case in $(echo $testcases | tr "$|^" "\n") + for case in $(echo $testcases | tr "$|^" "\n" | awk '!/^$/') do cd ${PADDLE_ROOT}/build precise_card_test "^${case}$" $num - # c++ + + #if test failed,continue,if test succeed ,go on + if_test_failed=$(cat $tmp_dir/^${case}$.log| grep "The following tests FAILED:") + if [[ "$if_test_failed" == "The following tests FAILED:" ]];then + echo "$testcases has failed,put it into 
prec_delta" + continue + else + echo "$testcases succeed" + fi + + # c++ if [ ! -d "${PADDLE_ROOT}/build/ut_map/$case" ];then mkdir ${PADDLE_ROOT}/build/ut_map/$case fi set -x find paddle/fluid -name '*.gcda'|xargs -I {} cp --path {} ut_map/$case + find paddle/phi -name '*.gcda'|xargs -I {} cp --path {} ut_map/$case + find paddle/utils -name '*.gcda'|xargs -I {} cp --path {} ut_map/$case + find paddle/phi -name '*.gcno'|xargs -I {} cp --path {} ut_map/$case + find paddle/utils -name '*.gcno'|xargs -I {} cp --path {} ut_map/$case find paddle/fluid -name '*.gcno'|xargs -I {} cp --path {} ut_map/$case python ${PADDLE_ROOT}/tools/get_single_test_cov.py ${PADDLE_ROOT} $case & @@ -1847,7 +1861,9 @@ function precise_card_test_single { fi mv python-coverage.data.* ${PADDLE_ROOT}/build/pytest/$case fi - find paddle/fluid -name *.gcda | xargs rm -f #delete gcda + find paddle/fluid -name *.gcda | xargs rm -f + find paddle/phi -name *.gcda | xargs rm -f + find paddle/utils -name *.gcda | xargs rm -f done } @@ -2009,12 +2025,12 @@ set -x #get notSuccessut including the failed uniitests and not executed unittests python ${PADDLE_ROOT}/tools/get_ut_file_map.py 'get_not_success_ut' ${PADDLE_ROOT} + #rerun the notSuccessut and get the mapping between notSuccessut and .cu files + get_failedUts_precise_map_file + #analyze the mapping between unit tests and .cu files python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'analy_h_cu_file' $tmp_dir ${PADDLE_ROOT} - wait; - #rerun the notSuccessut and get the mapping between notSuccessut and .cu files - get_failedUts_precise_map_file #generate python coverage and generate python file to tests_map_file python ${PADDLE_ROOT}/tools/pyCov_multithreading.py ${PADDLE_ROOT} @@ -2117,12 +2133,6 @@ function get_failedUts_precise_map_file { if [[ -f "${PADDLE_ROOT}/build/utNotSuccess" ]]; then rerun_tests=`cat ${PADDLE_ROOT}/build/utNotSuccess` #remove pile to full h/cu file - python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'remove_pile_from_h_file' ${PADDLE_ROOT} - cd ${PADDLE_ROOT}/build - cmake_base ${PYTHON_ABI:-""} - build ${parallel_number} - pip uninstall -y paddlepaddle-gpu - pip install ${PADDLE_ROOT}/build/python/dist/*whl precise_card_test_single "$rerun_tests" wait; diff --git a/tools/final_ut_parallel_rule.py b/tools/final_ut_parallel_rule.py index 4d98eee41fb97..bc819b59d8fd1 100644 --- a/tools/final_ut_parallel_rule.py +++ b/tools/final_ut_parallel_rule.py @@ -141,16 +141,6 @@ def classify_cases_by_mem(rootPath): case_mem_1_line = case_mem_1_line + '|^' + index[0] + '$' else: f_not_0.write(case_mem_1_line + '\n') - ''' - if len(always_timeout_list - ) != 0 and cardType == 'single_card_tests' and count > 25: - f.write(case_mem_1_line + '|^%s$\n' % - always_timeout_list[0]) - always_timeout_list.pop(0) - else: - f.write(case_mem_1_line + '\n') - count += 1 - ''' case_mem_1_line = '^job$|^' + index[0] + '$' mem_1_sum = index[1] f_not_0.write(case_mem_1_line + '\n') diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 4dbd32d962636..990a0298b0c62 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -464,7 +464,8 @@ def get_pr_ut(self): if ret: with open('prec_delta') as delta: for ut in delta: - ut_list.append(ut.rstrip('\r\n')) + if ut not in ut_list: + ut_list.append(ut.rstrip('\r\n')) else: print('PREC download prec_delta failed') exit(1) diff --git a/tools/get_single_test_cov.py b/tools/get_single_test_cov.py index 293fed5e4b3b9..266872feaf4e3 100644 --- a/tools/get_single_test_cov.py +++ b/tools/get_single_test_cov.py @@ -21,7 +21,12 @@ def 
getFNDAFile(rootPath, test): filename = '%s/build/ut_map/%s/coverage.info.tmp' % (rootPath, test) fn_filename = '%s/build/ut_map/%s/fnda.tmp' % (rootPath, test) os.system('touch %s' % fn_filename) - f = open(filename) + try: + f = open(filename) + print("oepn %s succesfully" % filename) + except FileNotFoundError: + print("%s is not found." % filename) + return lines = f.readlines() for line in lines: line = line.replace('\n', '') @@ -47,8 +52,22 @@ def analysisFNDAFile(rootPath, test): ) os.system('touch %s' % related_ut_map_file) os.system('touch %s' % notrelated_ut_map_file) + + if os.path.isfile(related_ut_map_file) and os.path.isfile( + notrelated_ut_map_file + ): + print("make related.txt and not_related.txt succesfully") + else: + print("make related.txt and not_related.txt failed") + return + fn_filename = '%s/build/ut_map/%s/fnda.tmp' % (rootPath, test) - f = open(fn_filename) + try: + f = open(fn_filename) + print("oepn %s succesfully" % fn_filename) + except FileNotFoundError: + print("%s is not found." % fn_filename) + return data = f.read().split('SF:') related_file_list = [] for message in data: @@ -99,14 +118,30 @@ def getCovinfo(rootPath, test): 'cd %s && lcov --capture -d . -o coverage.info --rc lcov_branch_coverage=0 > /dev/null 2>&1' % ut_map_path ) + coverage_info_path = ut_map_path + '/coverage.info' + file_size = os.path.getsize(coverage_info_path) + if file_size == 0: + print("coverage.info is empty,collect coverage rate failed") + return + else: + print("get coverage.info succesfully") os.system( - "cd %s && lcov --extract coverage.info '/paddle/paddle/fluid/framework/*' '/paddle/paddle/fluid/imperative/*' '/paddle/paddle/fluid/inference/*' '/paddle/paddle/fluid/memory/*' '/paddle/paddle/fluid/operators/*' '/paddle/paddle/fluid/string/*' '/paddle/paddle/fluid/distributed/*' '/paddle/paddle/fluid/platform/*' '/paddle/paddle/fluid/pybind/*' '/paddle/build/*' -o coverage.info.tmp --rc lcov_branch_coverage=0 > /dev/null 2>&1" + "cd %s && lcov --extract coverage.info '/paddle/paddle/phi/*' '/paddle/paddle/utils/*' '/paddle/paddle/fluid/framework/*' '/paddle/paddle/fluid/imperative/*' '/paddle/paddle/fluid/inference/*' '/paddle/paddle/fluid/memory/*' '/paddle/paddle/fluid/operators/*' '/paddle/paddle/fluid/string/*' '/paddle/paddle/fluid/distributed/*' '/paddle/paddle/fluid/platform/*' '/paddle/paddle/fluid/pybind/*' '/paddle/build/*' -o coverage.info.tmp --rc lcov_branch_coverage=0 > /dev/null 2>&1" % ut_map_path ) + coverage_info_tmp = ut_map_path + '/coverage.info.tmp' + coverage_tmp_size = os.path.getsize(coverage_info_tmp) + if coverage_tmp_size == 0: + print("coverage.info.tmp is empty,collect coverage rate failed") + return + else: + print("get coverage.info.tmp succesfully") + os.system('rm -rf %s/paddle' % ut_map_path) os.system('rm -rf %s/coverage.info' % ut_map_path) getFNDAFile(rootPath, test) analysisFNDAFile(rootPath, test) + os.system('rm -rf %s/coverage.info.tmp' % ut_map_path) if __name__ == "__main__": diff --git a/tools/get_ut_file_map.py b/tools/get_ut_file_map.py index 2bd777cc8276e..269d844d3292b 100644 --- a/tools/get_ut_file_map.py +++ b/tools/get_ut_file_map.py @@ -62,13 +62,33 @@ def handle_ut_file_map(rootPath): ut_file_map = {} count = 0 not_success_file = open("%s/build/prec_delta" % rootPath, 'w') + # if testdir is not made,write the test into prec_delta + get_all_uts(rootPath) + all_ut = '%s/build/all_uts_paddle' % rootPath + with open(all_ut, 'r') as f: + all_ut_list = [] + for ut in f.readlines(): + ut = ut.replace('\n', '') + 
all_ut_list.append(ut.strip()) + f.close() + for ut in all_ut_list: + filedir = '%s/build/ut_map/%s' % (rootPath, ut) + if not os.path.exists(filedir): + not_success_file.write('%s\n' % ut) + utNotSuccess_list.append(ut) + # if fnda.tmp not exists,write the test into prec_delta for ut in files: count = count + 1 print("ut %s: %s" % (count, ut)) - coverage_info = '%s/%s/coverage.info.tmp' % (ut_map_path, ut) + coverage_info = '%s/%s/fnda.tmp' % (ut_map_path, ut) if os.path.exists(coverage_info): filename = '%s/%s/related_%s.txt' % (ut_map_path, ut, ut) - f = open(filename) + try: + f = open(filename) + print("oepn %s succesfully" % filename) + except FileNotFoundError: + print("%s is not found." % filename) + return lines = f.readlines() for line in lines: line = line.replace('\n', '').strip() @@ -87,6 +107,7 @@ def handle_ut_file_map(rootPath): ut_file_map[source_file] = [] if ut not in ut_file_map[source_file]: ut_file_map[source_file].append(ut) + f.close() else: not_success_file.write('%s\n' % ut) utNotSuccess_list.append(ut) @@ -98,7 +119,11 @@ def handle_ut_file_map(rootPath): for ut in files: if ut not in utNotSuccess_list: filename = '%s/%s/notrelated_%s.txt' % (ut_map_path, ut, ut) - f = open(filename) + try: + f = open(filename) + print("oepn %s succesfully" % filename) + except FileNotFoundError: + print("%s is not found." % filename) lines = f.readlines() for line in lines: line = line.replace('\n', '').strip() @@ -110,7 +135,7 @@ def handle_ut_file_map(rootPath): source_file = line if source_file not in ut_file_map: ut_file_map[source_file] = [] - + f.close() with open("%s/build/ut_file_map.json" % rootPath, "w") as f: json.dump(ut_file_map, f, indent=4) @@ -122,7 +147,7 @@ def notsuccessfuc(rootPath): count = 0 # ut failed!! for ut in files: - coverage_info = '%s/%s/coverage.info.tmp' % (ut_map_path, ut) + coverage_info = '%s/%s/fnda.tmp' % (ut_map_path, ut) if os.path.exists(coverage_info): pass else: @@ -130,6 +155,7 @@ def notsuccessfuc(rootPath): utNotSuccess = utNotSuccess + '^%s$|' % ut # ut not exec + get_all_uts(rootPath) with open("/paddle/build/all_uts_paddle", "r") as f: data = f.readlines() @@ -149,35 +175,32 @@ def notsuccessfuc(rootPath): def ut_file_map_supplement(rootPath): ut_file_map_new = "%s/build/ut_file_map.json" % rootPath - os.system('mkdir /pre_test') + os.system('mkdir /pre_test_tmp') os.system( - 'cd /pre_test && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/pre_test/ut_file_map.json --no-check-certificate' + 'cd /pre_test_tmp && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/pre_test/ut_file_map.json --no-check-certificate' ) - ut_file_map_old = "/pre_test/ut_file_map.json" + ut_file_map_old = "/pre_test_tmp/ut_file_map.json" with open(ut_file_map_new, 'r') as load_f: load_dict_new = json.load(load_f) with open(ut_file_map_old, 'r') as f: load_dict_old = json.load(f) all_uts_paddle = '%s/build/all_uts_paddle' % rootPath + with open(all_uts_paddle, 'r') as f: all_uts_paddle_list = [] for ut in f.readlines(): all_uts_paddle_list.append(ut.strip()) f.close() - for filename in load_dict_old: - if filename not in load_dict_new: - load_dict_new[filename] = load_dict_old[filename] - - with open("/pre_test/ut_file_map.json", "w") as f: + with open("/pre_test_tmp/ut_file_map.json", "w") as f: json.dump(load_dict_new, f, indent=4) print("load_dict_new success!!") os.system( - 'cd /pre_test && wget --no-proxy https://paddle-docker-tar.bj.bcebos.com/pre_test/prec_delta --no-check-certificate' + 'cd /pre_test_tmp && wget --no-proxy 
https://paddle-docker-tar.bj.bcebos.com/pre_test/prec_delta --no-check-certificate' ) - prec_delta_old = '/pre_test/prec_delta' + prec_delta_old = '/pre_test_tmp/prec_delta' prec_delta_new = "%s/build/prec_delta" % rootPath with open(prec_delta_old, 'r') as f: prec_delta_old_list = [] @@ -189,15 +212,16 @@ def ut_file_map_supplement(rootPath): for ut in f.readlines(): prec_delta_new_list.append(ut.strip()) f.close() + for ut in prec_delta_old_list: - filename = '%s/build/ut_map/%s/coverage.info.tmp' % (rootPath, ut) + filename = '%s/build/ut_map/%s/fnda.tmp' % (rootPath, ut) if ut in all_uts_paddle_list: if not os.path.exists(filename) and ut not in prec_delta_new_list: prec_delta_new_list.append(ut) prec_delta_new_list.append( 'test_py_reader_error_msg' ) # add a python case for pycoverage - prec_delta_file = open("/pre_test/prec_delta", 'w') + prec_delta_file = open("/pre_test_tmp/prec_delta", 'w') for ut in prec_delta_new_list: prec_delta_file.write(ut + '\n') print("prec_delta_file success!!") From 8607a180e1adb66e286336165bd4ddc4c1a8ec8e Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Thu, 27 Oct 2022 16:30:58 +0800 Subject: [PATCH 05/91] clean abs cudnn (#47374) --- paddle/phi/api/yaml/op_compat.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 461f0c7bfe2f4..e40326cf6e2d4 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -6,7 +6,7 @@ - op : abs backward : abs_grad extra : - attrs : [bool use_cudnn = false, bool use_mkldnn = false] + attrs : [bool use_mkldnn = false] - op : acosh backward : acosh_grad From 4d5c8a69a99f35643e737060fb7b9ce5b6ff87d0 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Thu, 27 Oct 2022 16:31:06 +0800 Subject: [PATCH 06/91] clean angle cudnn (#47375) --- paddle/phi/api/yaml/op_compat.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index e40326cf6e2d4..1cc0f6a8ae2e9 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -32,7 +32,7 @@ - op : angle backward : angle_grad extra : - attrs : [bool use_cudnn = false, bool use_mkldnn = false] + attrs : [bool use_mkldnn = false] - op : asinh backward : asinh_grad From 539f30061e1feb4ddbe17b9ce7a82679db18e395 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Thu, 27 Oct 2022 16:31:14 +0800 Subject: [PATCH 07/91] clean gelu cudnn (#47378) --- paddle/phi/api/yaml/op_compat.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 1cc0f6a8ae2e9..cf47c79cb6d67 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -375,7 +375,7 @@ - op : gelu backward : gelu_grad extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool use_cudnn = false] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : grad_add extra : From 2096448bb1268ffef91be4e013842cc685605683 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 27 Oct 2022 17:06:22 +0800 Subject: [PATCH 08/91] make all cpp tests dynamic linked to libpaddle.so [except windows] (#47088) * make all cpp tests dynamic linked to libpaddle.so * add comments * keep old cc_test for some tests * fix some ut * make some ut use cc_test_old * fix typos and fit for win32 * fix lib path * fix some tests * skip lite test * fit for rocm * fit for cinn * fit for 
mac * fit for win32 * skip inference ut * skip windows * fix coverage --- CMakeLists.txt | 1 + cmake/generic.cmake | 79 +++++-- paddle/CMakeLists.txt | 129 +++++++++- .../fleet_executor/test/CMakeLists.txt | 65 +++--- paddle/fluid/distributed/test/CMakeLists.txt | 141 +++++++---- .../tests/data_structure_tests/CMakeLists.txt | 36 +-- .../tests/performance_tests/CMakeLists.txt | 40 +++- paddle/fluid/framework/CMakeLists.txt | 22 +- paddle/fluid/framework/details/CMakeLists.txt | 20 +- .../framework/details/build_strategy_test.cc | 10 +- paddle/fluid/framework/ir/CMakeLists.txt | 45 ++-- paddle/fluid/framework/ir/graph_test.cc | 18 +- ...conv_activation_mkldnn_fuse_pass_tester.cc | 2 + .../framework/new_executor/CMakeLists.txt | 20 +- .../framework/paddle2cinn/CMakeLists.txt | 67 +++--- .../framework/var_type_inference_test.cc | 4 +- paddle/fluid/inference/CMakeLists.txt | 5 + paddle/fluid/inference/api/CMakeLists.txt | 23 +- paddle/fluid/inference/lite/CMakeLists.txt | 19 +- paddle/fluid/inference/utils/CMakeLists.txt | 15 +- paddle/fluid/inference/utils/table_printer.cc | 4 +- paddle/fluid/jit/CMakeLists.txt | 11 +- paddle/fluid/memory/allocation/CMakeLists.txt | 35 +-- .../fluid/operators/benchmark/CMakeLists.txt | 2 +- .../operators/benchmark/op_tester_config.cc | 220 +----------------- .../operators/benchmark/op_tester_config.h | 216 +++++++++++++++++ paddle/fluid/operators/cinn/CMakeLists.txt | 47 ++-- .../fluid/operators/copy_cross_scope_test.cc | 1 - paddle/fluid/operators/lite/CMakeLists.txt | 6 +- .../operators/mkldnn/nhwc_op_tests.cmake | 28 +-- .../fluid/operators/prim_ops/CMakeLists.txt | 5 +- paddle/fluid/operators/pscore/CMakeLists.txt | 86 +++---- paddle/fluid/pybind/CMakeLists.txt | 9 + paddle/phi/tests/core/CMakeLists.txt | 11 +- paddle/phi/tests/ops/test_op_signature.cc | 150 ++++++------ paddle/testing/CMakeLists.txt | 8 + 36 files changed, 915 insertions(+), 685 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e125063f15a2f..f2489526c5c89 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ endif() # use to get_property location of static lib # https://cmake.org/cmake/help/v3.0/policy/CMP0026.html?highlight=cmp0026 cmake_policy(SET CMP0026 OLD) +cmake_policy(SET CMP0079 NEW) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 6b2aabc027977..e4cd6ac73bf71 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -486,14 +486,17 @@ endfunction() function(cc_test_run TARGET_NAME) if(WITH_TESTING) - set(oneValueArgs "") + set(oneValueArgs DIR) set(multiValueArgs COMMAND ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(cc_test_DIR STREQUAL "") + set(cc_test_DIR ${CMAKE_CURRENT_BINARY_DIR}) + endif() add_test( NAME ${TARGET_NAME} COMMAND ${cc_test_COMMAND} ${cc_test_ARGS} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + WORKING_DIRECTORY ${cc_test_DIR}) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT @@ -513,7 +516,57 @@ function(cc_test_run TARGET_NAME) endif() endfunction() +set_property(GLOBAL PROPERTY TEST_SRCS "") +set_property(GLOBAL PROPERTY TEST_NAMES "") function(cc_test TARGET_NAME) + if(WITH_TESTING) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS ARGS) + cmake_parse_arguments(cc_test "${options}" 
"${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + if(WIN32) + # NOTE(zhiqiu): on windows platform, the symbols should be exported + # explicitly by __declspec(dllexport), however, there are serveral + # symbols not exported, and link error occurs. + # so, the tests are not built against dynamic libraries now. + cc_test_old( + ${TARGET_NAME} + SRCS + ${cc_test_SRCS} + DEPS + ${cc_test_DEPS} + ARGS + ${cc_test_ARGS}) + else() + list(LENGTH cc_test_SRCS len) + # message("cc_test_SRCS ${cc_test_SRCS}") + # message("cc_test_ARGS ${cc_test_ARGS}") + + if(${len} GREATER 1) + message( + SEND_ERROR + "The number source file of cc_test should be 1, but got ${len}, the source files are: ${cc_test_SRCS}" + ) + endif() + + list(LENGTH cc_test_ARGS len_arg) + if(len_arg GREATER_EQUAL 1) + set_property(GLOBAL PROPERTY "${TARGET_NAME}_ARGS" "${cc_test_ARGS}") + #message("${TARGET_NAME}_ARGS arg ${arg}") + endif() + + get_property(test_srcs GLOBAL PROPERTY TEST_SRCS) + set(test_srcs ${test_srcs} "${CMAKE_CURRENT_SOURCE_DIR}/${cc_test_SRCS}") + set_property(GLOBAL PROPERTY TEST_SRCS "${test_srcs}") + + get_property(test_names GLOBAL PROPERTY TEST_NAMES) + set(test_names ${test_names} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY TEST_NAMES "${test_names}") + endif() + endif() +endfunction() + +function(cc_test_old TARGET_NAME) if(WITH_TESTING) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS) @@ -626,25 +679,9 @@ function(nv_test TARGET_NAME) # Reference: https://cmake.org/cmake/help/v3.10/module/FindCUDA.html add_executable(${TARGET_NAME} ${nv_test_SRCS}) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - target_link_libraries( - ${TARGET_NAME} - ${nv_test_DEPS} - paddle_gtest_main - lod_tensor - memory - gtest - gflags - glog - ${os_dependency_modules}) - add_dependencies( - ${TARGET_NAME} - ${nv_test_DEPS} - paddle_gtest_main - lod_tensor - memory - gtest - gflags - glog) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} + ${os_dependency_modules} paddle_gtest_main) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 07041455df4fd..895bd9db9cf64 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -1,9 +1,132 @@ -add_subdirectory(utils) -add_subdirectory(scripts) -add_subdirectory(testing) +set(CC_TESTS_DIR + ${PADDLE_BINARY_DIR}/paddle/tests + CACHE INTERNAL "c++ tests directory") set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory") + +add_subdirectory(utils) +add_subdirectory(scripts) +add_subdirectory(testing) + add_subdirectory(phi) add_subdirectory(infrt) add_subdirectory(fluid) + +# NOTE(zhiqiu): The changes of cc tests +# Before, (1) the source file of cc tests are distributed in different sub-directories, +# (2) the tests are added and configured by calling `cc_test()` in each `CMakeLists.txt`, +# (3) the tests links static libraries of paddle modules, +# (4) the tests binaries are generated in different directories, as the same as the +# folder of source file. + +# Now, we want to make all cc tests dynamically linked to the main paddle labrary, +# i.e., `libpaddle.so`, so we changes the logic of (2), (3), (4): +# (2) calling `cc_test()` in each `CMakeLists.txt` will not `exactly` add test, but +# record all tests and its source files, the action of add tests is defered to HERE. 
+# Why doing so? since the target of `libpaddle.so` is mostly the last target, and +# the tests should be added after that accroding to dependency. +# (3) the tests links dynamic libraries, `libpaddle.so` +# (4) the tests are generated to the same directory, i.e., `CC_TESTS_DIR` defined above. + +# Next, (to be discusssed) +# (1) move all source files to same folder, +# (2) naturally, and and configure tests in only one `CMakeLists.txt`, +# (3) cc tests support linking pre-built dynamic libraries. For example, use the dynamic +# library in the installed paddle by `pip`. + +# add all tests here +get_property(test_srcs GLOBAL PROPERTY TEST_SRCS) +get_property(test_names GLOBAL PROPERTY TEST_NAMES) +# message("test_srcs ${test_srcs}") + +get_property(paddle_lib GLOBAL PROPERTY PADDLE_LIB_NAME) + +set(POSTFIX ".so") +if(WIN32) + set(POSTFIX ".dll") +endif() + +list(LENGTH test_names len) +if(${len} GREATER_EQUAL 1) + message("Total tests: ${len}") + math(EXPR stop "${len} - 1") + foreach(idx RANGE ${stop}) + if(WITH_TESTING) + list(GET test_srcs ${idx} test_src) + list(GET test_names ${idx} test_name) + get_property(test_arg GLOBAL PROPERTY "${test_name}_ARGS") + message("add test ${test_name}") + add_executable(${test_name} ${test_src}) + # target_link_libraries( + # ${test_name} + # ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/libpaddle${POSTFIX}) + target_link_libraries(${test_name} $) + target_link_libraries(${test_name} paddle_gtest_main_new) + add_dependencies(${test_name} ${paddle_lib} paddle_gtest_main_new) + if(WITH_GPU) + target_link_libraries(${test_name} ${CUDA_CUDART_LIBRARY} + "-Wl,--as-needed") + endif() + if(WITH_ROCM) + target_link_libraries(${test_name} ${ROCM_HIPRTC_LIB}) + endif() + if(APPLE) + target_link_libraries(${test_name} + "-Wl,-rpath,$") + endif() + if(NOT + ("${test_name}" STREQUAL "c_broadcast_op_npu_test" + OR "${test_name}" STREQUAL "c_allreduce_sum_op_npu_test" + OR "${test_name}" STREQUAL "c_allreduce_max_op_npu_test" + OR "${test_name}" STREQUAL "c_reducescatter_op_npu_test" + OR "${test_name}" STREQUAL "c_allgather_op_npu_test" + OR "${test_name}" STREQUAL "send_v2_op_npu_test" + OR "${test_name}" STREQUAL "c_reduce_sum_op_npu_test" + OR "${test_name}" STREQUAL "recv_v2_op_npu_test")) + cc_test_run( + ${test_name} + COMMAND + ${test_name} + ARGS + ${test_arg} + DIR + ${CC_TESTS_DIR}) + endif() + elseif(WITH_TESTING AND NOT TEST ${test_name}) + add_test(NAME ${test_name} COMMAND ${CMAKE_COMMAND} -E echo CI skip + ${test_name}.) + endif() + set_target_properties(${test_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${CC_TESTS_DIR}") + endforeach() +endif() + +# set properties for some tests, it should be set after the tests defined. 
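# (A hypothetical illustration of the scheme above; foo_test and its files
# are made up, not part of this patch: a sub-directory call such as
#   cc_test(foo_test SRCS foo_test.cc ARGS --dirname=/tmp)
# no longer creates a target on the spot. It appends foo_test.cc to the
# global TEST_SRCS property, foo_test to TEST_NAMES, and stores the ARGS in
# the global property "foo_test_ARGS". The foreach loop above then creates
# one executable per recorded name, links it against the paddle shared
# library plus paddle_gtest_main_new, registers it with ctest through
# cc_test_run, and points its RUNTIME_OUTPUT_DIRECTORY at CC_TESTS_DIR.)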
+if(TARGET standalone_executor_test) + set_tests_properties(standalone_executor_test PROPERTIES TIMEOUT 100) + if(NOT WIN32) + add_dependencies(standalone_executor_test download_program) + endif() +endif() + +if(TARGET layer_test) + add_dependencies(layer_test jit_download_program) + add_dependencies(layer_test_new jit_download_program) + set_tests_properties(layer_test_new PROPERTIES ENVIRONMENT + "FLAGS_jit_engine_type=New") +endif() + +if(TEST buddy_allocator_test) + if(NOT WIN32) + add_dependencies(buddy_allocator_test download_data) + endif() + set_tests_properties(buddy_allocator_test PROPERTIES LABELS + "RUN_TYPE=EXCLUSIVE") +endif() + +add_custom_target(build_tests) +# add target to build all cpp tests +if(${len} GREATER_EQUAL 1) + add_dependencies(build_tests ${test_names}) +endif() diff --git a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt index 0cd39b3aad6e6..a0078c675d505 100644 --- a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt @@ -1,63 +1,55 @@ set_source_files_properties( interceptor_ping_pong_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( - interceptor_ping_pong_test - SRCS interceptor_ping_pong_test.cc - DEPS fleet_executor ${BRPC_DEPS}) +cc_test_old(interceptor_ping_pong_test SRCS interceptor_ping_pong_test.cc DEPS + fleet_executor ${BRPC_DEPS}) set_source_files_properties( compute_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( - compute_interceptor_test - SRCS compute_interceptor_test.cc - DEPS fleet_executor ${BRPC_DEPS}) +cc_test_old(compute_interceptor_test SRCS compute_interceptor_test.cc DEPS + fleet_executor ${BRPC_DEPS}) set_source_files_properties( source_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( - source_interceptor_test - SRCS source_interceptor_test.cc - DEPS fleet_executor ${BRPC_DEPS}) +cc_test_old(source_interceptor_test SRCS source_interceptor_test.cc DEPS + fleet_executor ${BRPC_DEPS}) set_source_files_properties( sink_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( - sink_interceptor_test - SRCS sink_interceptor_test.cc - DEPS fleet_executor ${BRPC_DEPS}) +cc_test_old(sink_interceptor_test SRCS sink_interceptor_test.cc DEPS + fleet_executor ${BRPC_DEPS}) set_source_files_properties( interceptor_pipeline_short_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( - interceptor_pipeline_short_path_test - SRCS interceptor_pipeline_short_path_test.cc - DEPS fleet_executor ${BRPC_DEPS}) +cc_test_old( + interceptor_pipeline_short_path_test SRCS + interceptor_pipeline_short_path_test.cc DEPS fleet_executor ${BRPC_DEPS}) set_source_files_properties( interceptor_pipeline_long_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( - interceptor_pipeline_long_path_test - SRCS interceptor_pipeline_long_path_test.cc - DEPS fleet_executor ${BRPC_DEPS}) +cc_test_old( + interceptor_pipeline_long_path_test SRCS + interceptor_pipeline_long_path_test.cc DEPS fleet_executor ${BRPC_DEPS}) set_source_files_properties( compute_interceptor_run_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( compute_interceptor_run_op_test - SRCS compute_interceptor_run_op_test.cc - DEPS fleet_executor - ${BRPC_DEPS} - op_registry - fill_constant_op - elementwise_add_op - scope - device_context) + SRCS + 
compute_interceptor_run_op_test.cc + DEPS + fleet_executor + ${BRPC_DEPS} + op_registry + fill_constant_op + elementwise_add_op + scope + device_context) if(WITH_DISTRIBUTE AND WITH_PSCORE @@ -65,8 +57,7 @@ if(WITH_DISTRIBUTE set_source_files_properties( interceptor_ping_pong_with_brpc_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test( - interceptor_ping_pong_with_brpc_test - SRCS interceptor_ping_pong_with_brpc_test.cc - DEPS fleet_executor ${BRPC_DEPS}) + cc_test_old( + interceptor_ping_pong_with_brpc_test SRCS + interceptor_ping_pong_with_brpc_test.cc DEPS fleet_executor ${BRPC_DEPS}) endif() diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 579b407495de0..30f14923e057c 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -1,108 +1,147 @@ set_source_files_properties( table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( table_test - SRCS table_test.cc - DEPS common_table table ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) + SRCS + table_test.cc + DEPS + common_table + table + ps_framework_proto + ${COMMON_DEPS} + ${RPC_DEPS}) set_source_files_properties( dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( dense_table_test - SRCS dense_table_test.cc - DEPS common_table table ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) + SRCS + dense_table_test.cc + DEPS + common_table + table + ps_framework_proto + ${COMMON_DEPS} + ${RPC_DEPS}) set_source_files_properties( barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( barrier_table_test - SRCS barrier_table_test.cc - DEPS common_table table ps_framework_proto ${COMMON_DEPS}) + SRCS + barrier_table_test.cc + DEPS + common_table + table + ps_framework_proto + ${COMMON_DEPS}) set_source_files_properties( brpc_service_dense_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( brpc_service_dense_sgd_test - SRCS brpc_service_dense_sgd_test.cc - DEPS scope ps_service table ps_framework_proto ${COMMON_DEPS}) + SRCS + brpc_service_dense_sgd_test.cc + DEPS + scope + ps_service + table + ps_framework_proto + ${COMMON_DEPS}) set_source_files_properties( brpc_service_sparse_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( brpc_service_sparse_sgd_test - SRCS brpc_service_sparse_sgd_test.cc - DEPS scope ps_service table ps_framework_proto ${COMMON_DEPS}) + SRCS + brpc_service_sparse_sgd_test.cc + DEPS + scope + ps_service + table + ps_framework_proto + ${COMMON_DEPS}) set_source_files_properties( brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( brpc_utils_test - SRCS brpc_utils_test.cc - DEPS brpc_utils scope math_function ${COMMON_DEPS} ${RPC_DEPS}) + SRCS + brpc_utils_test.cc + DEPS + brpc_utils + scope + math_function + ${COMMON_DEPS} + ${RPC_DEPS}) set_source_files_properties( graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( graph_node_test - SRCS graph_node_test.cc - DEPS scope ps_service table ps_framework_proto ${COMMON_DEPS}) + SRCS + graph_node_test.cc + DEPS + scope + ps_service + table + ps_framework_proto + ${COMMON_DEPS}) set_source_files_properties( graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( graph_node_split_test - SRCS graph_node_split_test.cc - 
DEPS scope ps_service table ps_framework_proto ${COMMON_DEPS}) + SRCS + graph_node_split_test.cc + DEPS + scope + ps_service + table + ps_framework_proto + ${COMMON_DEPS}) set_source_files_properties( graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( graph_table_sample_test - SRCS graph_table_sample_test.cc - DEPS table ps_framework_proto ${COMMON_DEPS}) + SRCS + graph_table_sample_test.cc + DEPS + table + ps_framework_proto + ${COMMON_DEPS}) set_source_files_properties( feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( - feature_value_test - SRCS feature_value_test.cc - DEPS ${COMMON_DEPS} table) +cc_test_old(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} + table) set_source_files_properties( sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( - sparse_sgd_rule_test - SRCS sparse_sgd_rule_test.cc - DEPS ${COMMON_DEPS} table) +cc_test_old(sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc DEPS + ${COMMON_DEPS} table) set_source_files_properties( ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( - ctr_accessor_test - SRCS ctr_accessor_test.cc - DEPS ${COMMON_DEPS} table) +cc_test_old(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} + table) set_source_files_properties( ctr_dymf_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( - ctr_dymf_accessor_test - SRCS ctr_dymf_accessor_test.cc - DEPS ${COMMON_DEPS} table) +cc_test_old(ctr_dymf_accessor_test SRCS ctr_dymf_accessor_test.cc DEPS + ${COMMON_DEPS} table) set_source_files_properties( memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( - memory_sparse_table_test - SRCS memory_sparse_table_test.cc - DEPS ${COMMON_DEPS} table) +cc_test_old(memory_sparse_table_test SRCS memory_sparse_table_test.cc DEPS + ${COMMON_DEPS} table) set_source_files_properties( memory_geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( - memory_sparse_geo_table_test - SRCS memory_geo_table_test.cc - DEPS ${COMMON_DEPS} table) +cc_test_old(memory_sparse_geo_table_test SRCS memory_geo_table_test.cc DEPS + ${COMMON_DEPS} table) diff --git a/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt b/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt index 90159e9b8c32e..3c695a5168cd3 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/data_structure_tests/CMakeLists.txt @@ -1,27 +1,15 @@ -cc_test( - test_egr_ds_eager_tensor - SRCS eager_tensor_test.cc - DEPS ${eager_deps}) -cc_test( - test_egr_ds_auotgrad_meta - SRCS autograd_meta_test.cc - DEPS ${eager_deps}) -cc_test( - test_egr_ds_grad_node_info - SRCS grad_node_info_test.cc - DEPS ${eager_deps}) -cc_test( - test_egr_ds_accumulation_node - SRCS accumulation_node_test.cc - DEPS ${eager_deps}) -cc_test( - test_egr_ds_tensor_wrapper - SRCS tensor_wrapper_test.cc - DEPS ${eager_deps}) +cc_test_old(test_egr_ds_eager_tensor SRCS eager_tensor_test.cc DEPS + ${eager_deps}) +cc_test_old(test_egr_ds_auotgrad_meta SRCS autograd_meta_test.cc DEPS + ${eager_deps}) +cc_test_old(test_egr_ds_grad_node_info SRCS grad_node_info_test.cc DEPS + ${eager_deps}) +cc_test_old(test_egr_ds_accumulation_node SRCS accumulation_node_test.cc DEPS + ${eager_deps}) +cc_test_old(test_egr_ds_tensor_wrapper SRCS tensor_wrapper_test.cc DEPS + ${eager_deps}) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) 
- cc_test( - test_egr_ds_grad_tensor_holder - SRCS grad_tensor_holder_test.cc - DEPS ${eager_deps} ${generated_deps}) + cc_test_old(test_egr_ds_grad_tensor_holder SRCS grad_tensor_holder_test.cc + DEPS ${eager_deps} ${generated_deps}) endif() diff --git a/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt index 7b6dfae729f38..2b6024ce68000 100644 --- a/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt @@ -10,20 +10,36 @@ cc_library( matmul_v2_op dygraph_function) -cc_test( +cc_test_old( test_egr_performance_benchmark_eager_cpu - SRCS benchmark_eager_cpu.cc - DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) -cc_test( + SRCS + benchmark_eager_cpu.cc + DEPS + performance_benchmark_utils + ${eager_deps} + ${fluid_deps}) +cc_test_old( test_egr_performance_benchmark_fluid_cpu - SRCS benchmark_fluid_cpu.cc - DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) + SRCS + benchmark_fluid_cpu.cc + DEPS + performance_benchmark_utils + ${eager_deps} + ${fluid_deps}) -cc_test( +cc_test_old( test_egr_performance_benchmark_eager_cuda - SRCS benchmark_eager_cuda.cc - DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) -cc_test( + SRCS + benchmark_eager_cuda.cc + DEPS + performance_benchmark_utils + ${eager_deps} + ${fluid_deps}) +cc_test_old( test_egr_performance_benchmark_fluid_cuda - SRCS benchmark_fluid_cuda.cc - DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) + SRCS + benchmark_fluid_cuda.cc + DEPS + performance_benchmark_utils + ${eager_deps} + ${fluid_deps}) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index c009cfcd92ac0..590d3b482cdd9 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1156,19 +1156,29 @@ cc_library( op_compatible_info SRCS op_compatible_info.cc DEPS string_helper proto_desc) -cc_test( +cc_test_old( op_compatible_info_test - SRCS op_compatible_info_test.cc - DEPS op_compatible_info proto_desc string_helper glog) + SRCS + op_compatible_info_test.cc + DEPS + op_compatible_info + proto_desc + string_helper + glog) cc_library( save_load_util SRCS save_load_util.cc DEPS tensor scope layer) -cc_test( +cc_test_old( save_load_util_test - SRCS save_load_util_test.cc - DEPS save_load_util tensor scope layer) + SRCS + save_load_util_test.cc + DEPS + save_load_util + tensor + scope + layer) cc_library( generator SRCS generator.cc diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 21518c4f831c7..e10a985b34833 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -322,16 +322,18 @@ cc_test( memory device_context broadcast_op_handle) -cc_test( +cc_test_old( gather_op_test - SRCS gather_op_handle_test.cc - DEPS var_handle - op_handle_base - scope - ddim - memory - device_context - gather_op_handle) + SRCS + gather_op_handle_test.cc + DEPS + var_handle + op_handle_base + scope + ddim + memory + device_context + gather_op_handle) cc_library( scope_buffered_monitor diff --git a/paddle/fluid/framework/details/build_strategy_test.cc b/paddle/fluid/framework/details/build_strategy_test.cc index 4b184ba552898..c39388fa5bc86 100644 --- a/paddle/fluid/framework/details/build_strategy_test.cc +++ b/paddle/fluid/framework/details/build_strategy_test.cc @@ -60,7 +60,7 @@ class SumOpWithKernel : public 
OperatorWithKernel { } // namespace framework } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT(sum, +REGISTER_OP_WITHOUT_GRADIENT(fake_sum, paddle::framework::SumOpWithKernel, paddle::framework::SumOpMaker); @@ -114,7 +114,7 @@ void BuildStrategyApply(BuildStrategy *build_strategy, ir::Graph *graph) { std::unique_ptr CreateGraph() { ProgramDesc prog; auto *op = prog.MutableBlock(0)->AppendOp(); - op->SetType("sum"); + op->SetType("fake_sum"); op->SetInput("X", {"a1"}); op->SetOutput("Out", {"b1"}); op->SetAttr("op_role", 1); @@ -133,7 +133,7 @@ std::unique_ptr CreateMultiGraph() { // Set contents in block_0. auto *op = prog.MutableBlock(0)->AppendOp(); - op->SetType("sum"); + op->SetType("fake_sum"); op->SetInput("X", {"test_a", "test_b", "test_c"}); op->SetOutput("Out", {"test_out"}); op->SetAttr("op_role", 1); @@ -149,7 +149,7 @@ std::unique_ptr CreateMultiGraph() { // Set contents in block_1. op = prog.MutableBlock(1)->AppendOp(); - op->SetType("sum"); + op->SetType("fake_sum"); op->SetInput("X", {"a1"}); op->SetOutput("Out", {"b1"}); op->SetAttr("op_role", 1); @@ -159,7 +159,7 @@ std::unique_ptr CreateMultiGraph() { // Set contents in block_2. op = prog.MutableBlock(2)->AppendOp(); - op->SetType("sum"); + op->SetType("fake_sum"); op->SetInput("X", {"a2"}); op->SetOutput("Out", {"b2"}); op->SetAttr("op_role", 1); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index bb8af808d27e3..279ab07ff31b0 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -384,30 +384,29 @@ if(WITH_MKLDNN) test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_activation_mkldnn_fuse_pass) - cc_test( - test_conv_elementwise_add_mkldnn_fuse_pass - SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc - DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util) - cc_test( - test_int8_scale_calculation_mkldnn_pass - SRCS mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc - DEPS int8_scale_calculation_mkldnn_pass pass_test_util) - cc_test( - test_params_quantization_mkldnn_pass - SRCS mkldnn/params_quantization_mkldnn_pass_tester.cc - DEPS params_quantization_mkldnn_pass) - cc_test( - test_fc_elementwise_add_mkldnn_fuse_pass - SRCS mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc - DEPS fc_elementwise_add_mkldnn_fuse_pass pass_test_util) - cc_test( - test_fc_act_mkldnn_fuse_pass - SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc + cc_test_old( + test_conv_elementwise_add_mkldnn_fuse_pass SRCS + mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS + conv_elementwise_add_mkldnn_fuse_pass pass_test_util) + cc_test_old( + test_int8_scale_calculation_mkldnn_pass SRCS + mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc DEPS + int8_scale_calculation_mkldnn_pass pass_test_util) + cc_test_old( + test_params_quantization_mkldnn_pass SRCS + mkldnn/params_quantization_mkldnn_pass_tester.cc DEPS + params_quantization_mkldnn_pass) + cc_test_old( + test_fc_elementwise_add_mkldnn_fuse_pass SRCS + mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS + fc_elementwise_add_mkldnn_fuse_pass pass_test_util) + cc_test_old( + test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util) - cc_test( - test_batch_norm_act_fuse_pass - SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc - DEPS batch_norm_act_fuse_pass pass_test_util) + cc_test_old( + test_batch_norm_act_fuse_pass SRCS + 
mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass + pass_test_util) set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index b1d550c54b4e0..b8ad98113a3a4 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -74,7 +74,7 @@ class DummyOpVarTypeInference : public VarTypeInference { } // namespace framework } // namespace paddle -REGISTER_OPERATOR(sum, +REGISTER_OPERATOR(fake_sum, paddle::framework::NOP, paddle::framework::SumOpMaker, paddle::framework::SumOpVarTypeInference); @@ -92,7 +92,7 @@ namespace framework { TEST(GraphTest, Basic) { ProgramDesc prog; auto *op = prog.MutableBlock(0)->AppendOp(); - op->SetType("sum"); + op->SetType("fake_sum"); op->SetInput("X", {"test_a", "test_b", "test_c"}); op->SetOutput("Out", {"test_out"}); op->SetAttr("op_role", 1); @@ -115,7 +115,7 @@ TEST(GraphTest, Basic) { std::unique_ptr g(new ir::Graph(prog)); std::vector nodes(g->Nodes().begin(), g->Nodes().end()); for (ir::Node *n : nodes) { - if (n->Name() == "sum") { + if (n->Name() == "fake_sum") { ASSERT_EQ(n->inputs.size(), 3UL); ASSERT_EQ(n->outputs.size(), 1UL); } else if (n->Name() == "test_a" || n->Name() == "test_b" || @@ -242,7 +242,7 @@ TEST(GraphTest, TestMultiBlock) { // Set contents in block_0. auto *op = prog.MutableBlock(0)->AppendOp(); - op->SetType("sum"); + op->SetType("fake_sum"); op->SetInput("X", {"test_a", "test_b", "test_c"}); op->SetOutput("Out", {"test_out"}); op->SetAttr("op_role", 1); @@ -262,7 +262,7 @@ TEST(GraphTest, TestMultiBlock) { // Set contents in block_1. op = prog.MutableBlock(1)->AppendOp(); - op->SetType("sum"); + op->SetType("fake_sum"); op->SetInput("X", {"a"}); op->SetOutput("Out", {"b"}); op->SetAttr("op_role", 1); @@ -280,7 +280,7 @@ TEST(GraphTest, TestMultiBlock) { // Set contents in block_2. op = prog.MutableBlock(2)->AppendOp(); - op->SetType("sum"); + op->SetType("fake_sum"); op->SetInput("X", {"a"}); op->SetOutput("Out", {"b"}); op->SetAttr("op_role", 1); @@ -305,7 +305,7 @@ TEST(GraphTest, TestMultiBlock) { const ir::Graph *g0 = g->GetSubGraph(0); std::vector nodes(g0->Nodes().begin(), g0->Nodes().end()); for (ir::Node *n : nodes) { - if (n->Name() == "sum") { + if (n->Name() == "fake_sum") { ASSERT_EQ(n->inputs.size(), 3UL); ASSERT_EQ(n->outputs.size(), 1UL); } else if (n->Name() == "test_a" || n->Name() == "test_b" || @@ -322,7 +322,7 @@ TEST(GraphTest, TestMultiBlock) { // Check contents in sub_graph_1. const ir::Graph *g1 = g->GetSubGraph(1); for (ir::Node *n : g1->Nodes()) { - if (n->Name() == "sum") { + if (n->Name() == "fake_sum") { ASSERT_EQ(n->outputs[0]->Name(), "b"); ASSERT_EQ(n->outputs.size(), 1UL); } @@ -335,7 +335,7 @@ TEST(GraphTest, TestMultiBlock) { // Check contents in sub_graph_2. 
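// (A hedged aside on the "sum" -> "fake_sum" rename in these fixtures: the
// patch does not state the reason, but a plausible one is that the tests
// now link dynamically against libpaddle.so, which already registers the
// real "sum" operator; registering a test-local dummy op under the same
// name would collide, and "fake_sum" sidesteps the clash.)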
const ir::Graph *g2 = g->GetSubGraph(2); for (ir::Node *n : g2->Nodes()) { - if (n->Name() == "sum") { + if (n->Name() == "fake_sum") { ASSERT_EQ(n->outputs[0]->Name(), "b"); ASSERT_EQ(n->outputs.size(), 1UL); } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc index 3efc0c9508bb5..326bbaf471eb7 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc @@ -52,6 +52,8 @@ void SetOp(ProgramDesc* prog, op->SetAttr("alpha", 0.02f); } else if (type == "relu6") { op->SetAttr("threshold", 6.0f); + } else if (type == "mish") { + op->SetAttr("threshold", 20.0f); } else if (type == "swish") { op->SetAttr("beta", 1.0f); } diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index ca9bf5f86de59..ea7c294df6b90 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -27,7 +27,8 @@ if(WITH_GPU COMMAND wget -nc --no-check-certificate https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program COMMAND wget -nc --no-check-certificate - https://paddle-ci.gz.bcebos.com/new_exec/lm_startup_program) + https://paddle-ci.gz.bcebos.com/new_exec/lm_startup_program + WORKING_DIRECTORY "${CC_TESTS_DIR}") # all operators used in the program set(OPS @@ -58,16 +59,11 @@ if(WITH_GPU # All deps of the operators above, part of GLOB_OPERATOR_DEPS. set(OP_DEPS generator softmax selected_rows_functor jit_kernel_helper concat_and_split cross_entropy) + cc_test(standalone_executor_test SRCS standalone_executor_test.cc) - cc_test( - standalone_executor_test - SRCS standalone_executor_test.cc - DEPS standalone_executor operator op_registry executor ${OPS} ${OP_DEPS}) - set_tests_properties(standalone_executor_test PROPERTIES TIMEOUT 100) - - add_dependencies(standalone_executor_test download_program) - if(WITH_PROFILER) - target_link_libraries(standalone_executor_test profiler) - add_dependencies(standalone_executor_test profiler) - endif() + # add_dependencies(standalone_executor_test download_program) + # if(WITH_PROFILER) + # target_link_libraries(standalone_executor_test profiler) + # add_dependencies(standalone_executor_test profiler) + # endif() endif() diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index 6aced31feb2f0..57ce4d3c56b59 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -40,55 +40,52 @@ cc_library( cinn_launch_context) if(WITH_TESTING) - cc_test( - cinn_lib_test - SRCS cinn_lib_test.cc - DEPS cinn) + cc_test_old(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) set_tests_properties(cinn_lib_test PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test( - cinn_cache_key_test - SRCS cinn_cache_key_test.cc - DEPS cinn_cache_key) + cc_test_old(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS + cinn_cache_key) set_tests_properties(cinn_cache_key_test PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test( + cc_test_old( build_cinn_pass_test - SRCS build_cinn_pass_test.cc - DEPS build_cinn_pass cinn_compiler op_registry mul_op activation_op - elementwise_add_op) + SRCS + build_cinn_pass_test.cc + DEPS + build_cinn_pass + cinn_compiler + op_registry + mul_op + activation_op + elementwise_add_op) set_tests_properties(build_cinn_pass_test PROPERTIES 
LABELS "RUN_TYPE=CINN") - cc_test( - transform_desc_test - SRCS transform_desc_test.cc - DEPS transform_desc) + cc_test_old(transform_desc_test SRCS transform_desc_test.cc DEPS + transform_desc) set_tests_properties(transform_desc_test PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test( - transform_type_test - SRCS transform_type_test.cc - DEPS transform_type) + cc_test_old(transform_type_test SRCS transform_type_test.cc DEPS + transform_type) set_tests_properties(transform_type_test PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test( - cinn_graph_symbolization_test - SRCS cinn_graph_symbolization_test.cc - DEPS cinn_graph_symbolization) + cc_test_old(cinn_graph_symbolization_test SRCS + cinn_graph_symbolization_test.cc DEPS cinn_graph_symbolization) set_tests_properties(cinn_graph_symbolization_test PROPERTIES LABELS "RUN_TYPE=CINN") - cc_test( + cc_test_old( cinn_compiler_test - SRCS cinn_compiler_test.cc - DEPS cinn_compiler - place - proto_desc - graph_viz_pass - build_cinn_pass - cinn - mul_op - activation_op - elementwise_add_op) + SRCS + cinn_compiler_test.cc + DEPS + cinn_compiler + place + proto_desc + graph_viz_pass + build_cinn_pass + cinn + mul_op + activation_op + elementwise_add_op) set_tests_properties(cinn_compiler_test PROPERTIES LABELS "RUN_TYPE=CINN") endif() diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 757878bae9e45..b7f7f32348ec6 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -63,7 +63,7 @@ class SumOpVarTypeInference : public VarTypeInference { } // namespace framework } // namespace paddle -REGISTER_OPERATOR(sum, +REGISTER_OPERATOR(fake_sum, paddle::framework::NOP, paddle::framework::SumOpMaker, paddle::framework::SumOpVarTypeInference); @@ -152,7 +152,7 @@ class TestStaticGraphVarTypeInference : public StaticGraphVarTypeInference { TEST(InferVarType, sum_op) { ProgramDesc prog; auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("sum"); + op->SetType("fake_sum"); op->SetInput("X", {"test_a", "test_b", "test_c"}); op->SetOutput("Out", {"test_out"}); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 728d3cf537760..33e564f097f0b 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -71,6 +71,11 @@ if(WIN32 AND WITH_GPU) cc_library(paddle_inference DEPS ${fluid_modules} phi ${STATIC_INFERENCE_API} ${utils_modules}) else() + # message("${fluid_modules}") + # message("PHI_MODULES ${phi_modules}") + # message("${phi_kernels}") + # message("${STATIC_INFERENCE_API}") + # message("${utils_modules}") create_static_lib(paddle_inference ${fluid_modules} ${phi_modules} ${phi_kernels} ${STATIC_INFERENCE_API} ${utils_modules}) endif() diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 3aff5d5536a23..697daea542089 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -136,16 +136,25 @@ if(WITH_TESTING) endif() if(NOT APPLE AND NOT WIN32) - cc_test( + cc_test_old( test_analysis_predictor - SRCS analysis_predictor_tester.cc - DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) + SRCS + analysis_predictor_tester.cc + DEPS + paddle_inference_shared + ARGS + --dirname=${WORD2VEC_MODEL_DIR}) elseif(WIN32) - cc_test( + cc_test_old( test_analysis_predictor - SRCS analysis_predictor_tester.cc - DEPS analysis_predictor benchmark 
${inference_deps} ARGS - --dirname=${WORD2VEC_MODEL_DIR}) + SRCS + analysis_predictor_tester.cc + DEPS + analysis_predictor + benchmark + ${inference_deps} + ARGS + --dirname=${WORD2VEC_MODEL_DIR}) endif() if(WITH_TESTING AND WITH_MKLDNN) diff --git a/paddle/fluid/inference/lite/CMakeLists.txt b/paddle/fluid/inference/lite/CMakeLists.txt index 3f4992b8946ec..8576fe68b5491 100644 --- a/paddle/fluid/inference/lite/CMakeLists.txt +++ b/paddle/fluid/inference/lite/CMakeLists.txt @@ -14,11 +14,16 @@ cc_library( lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy ${LITE_DEPS} framework_proto device_context ${XPU_DEPS}) -cc_test( +cc_test_old( test_lite_engine - SRCS test_engine_lite.cc - DEPS lite_engine protobuf framework_proto glog gtest analysis) -cc_test( - test_lite_tensor_utils - SRCS test_tensor_utils.cc - DEPS lite_engine lite_tensor_utils) + SRCS + test_engine_lite.cc + DEPS + lite_engine + protobuf + framework_proto + glog + gtest + analysis) +cc_test_old(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine + lite_tensor_utils) diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index f165002f353e4..7abd7c25b807a 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -2,10 +2,7 @@ cc_library( benchmark SRCS benchmark.cc DEPS enforce) -cc_test( - test_benchmark - SRCS benchmark_tester.cc - DEPS benchmark) +cc_test_old(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_library( infer_io_utils SRCS io_utils.cc @@ -14,10 +11,7 @@ cc_library( model_utils SRCS model_utils.cc DEPS proto_desc enforce) -cc_test( - infer_io_utils_tester - SRCS io_utils_tester.cc - DEPS infer_io_utils) +cc_test_old(infer_io_utils_tester SRCS io_utils_tester.cc DEPS infer_io_utils) if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will @@ -26,9 +20,6 @@ if(WITH_ONNXRUNTIME AND WIN32) endif() cc_library(table_printer SRCS table_printer.cc) -cc_test( - test_table_printer - SRCS table_printer_tester.cc - DEPS table_printer) +cc_test_old(test_table_printer SRCS table_printer_tester.cc DEPS table_printer) proto_library(shape_range_info_proto SRCS shape_range_info.proto) diff --git a/paddle/fluid/inference/utils/table_printer.cc b/paddle/fluid/inference/utils/table_printer.cc index 628465c423b03..83dcef6cfb425 100644 --- a/paddle/fluid/inference/utils/table_printer.cc +++ b/paddle/fluid/inference/utils/table_printer.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/inference/utils/table_printer.h" -#ifdef WIN32 +#ifdef _WIN32 // suppress the min and max definitions in Windef.h. 
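// (Why _WIN32 rather than WIN32: _WIN32 is predefined by the compiler for
// Windows targets, while plain WIN32 is not guaranteed to be defined until
// the Windows headers have been included, so only _WIN32 is a reliable
// guard at this point in the file.)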
#define NOMINMAX #include @@ -58,7 +58,7 @@ std::string TablePrinter::PrintTable() { TablePrinter::TablePrinter(const std::vector& header) { size_t terminal_witdh = 500; -#ifdef WIN32 +#ifdef _WIN32 CONSOLE_SCREEN_BUFFER_INFO csbi; int ret = GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi); if (ret && (csbi.dwSize.X != 0)) { diff --git a/paddle/fluid/jit/CMakeLists.txt b/paddle/fluid/jit/CMakeLists.txt index 872845d07999c..f47de23b0e165 100644 --- a/paddle/fluid/jit/CMakeLists.txt +++ b/paddle/fluid/jit/CMakeLists.txt @@ -56,7 +56,8 @@ if(WITH_TESTING AND NOT WIN32) COMMAND wget -nc -q --no-check-certificate https://paddle-ci.gz.bcebos.com/dy2st/multi_program_load_with_property.tar.gz - COMMAND tar zxf multi_program_load_with_property.tar.gz) + COMMAND tar zxf multi_program_load_with_property.tar.gz + WORKING_DIRECTORY "${CC_TESTS_DIR}") set(JIT_DEPS phi phi_api @@ -73,13 +74,13 @@ if(WITH_TESTING AND NOT WIN32) layer_test SRCS layer_test.cc DEPS ${JIT_DEPS}) - add_dependencies(layer_test jit_download_program) + # add_dependencies(layer_test jit_download_program) cc_test( layer_test_new SRCS layer_test.cc DEPS ${JIT_DEPS}) - add_dependencies(layer_test_new jit_download_program) - set_tests_properties(layer_test_new PROPERTIES ENVIRONMENT - "FLAGS_jit_engine_type=New") + # add_dependencies(layer_test_new jit_download_program) + # set_tests_properties(layer_test_new PROPERTIES ENVIRONMENT + # "FLAGS_jit_engine_type=New") endif() diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 515524f68bd18..8eff7a3fc881b 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -75,10 +75,8 @@ cc_test( naive_best_fit_allocator_test SRCS naive_best_fit_allocator_test.cc DEPS allocator) -cc_test( - buffered_allocator_test - SRCS buffered_allocator_test.cc - DEPS allocator) +cc_test_old(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS + allocator) if(WITH_GPU) nv_test( @@ -104,21 +102,14 @@ elseif(WITH_ROCM) SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu DEPS allocator memcpy) else() - cc_test( - best_fit_allocator_test - SRCS best_fit_allocator_test.cc - DEPS allocator) + cc_test_old(best_fit_allocator_test SRCS best_fit_allocator_test.cc DEPS + allocator) endif() -cc_test( - test_aligned_allocator - SRCS test_aligned_allocator.cc - DEPS allocator) +cc_test_old(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS + allocator) -cc_test( - retry_allocator_test - SRCS retry_allocator_test.cc - DEPS allocator) +cc_test_old(retry_allocator_test SRCS retry_allocator_test.cc DEPS allocator) if(TEST retry_allocator_test) set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") @@ -138,10 +129,8 @@ cc_test( auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS allocator) -cc_test( - auto_growth_best_fit_allocator_test - SRCS auto_growth_best_fit_allocator_test.cc - DEPS allocator) +cc_test_old(auto_growth_best_fit_allocator_test SRCS + auto_growth_best_fit_allocator_test.cc DEPS allocator) if(NOT WIN32) cc_test( @@ -161,11 +150,6 @@ cc_test( DEPS allocator) if(WITH_TESTING) - if(TEST buddy_allocator_test) - set_tests_properties(buddy_allocator_test PROPERTIES LABELS - "RUN_TYPE=EXCLUSIVE") - endif() - # TODO(zhiqiu): why not win32? 
because wget is not found on windows if(NOT WIN32) add_custom_target( @@ -173,6 +157,5 @@ if(WITH_TESTING) COMMAND wget -nc --no-check-certificate https://paddle-ci.cdn.bcebos.com/buddy_allocator_test_data.tar COMMAND tar -xf buddy_allocator_test_data.tar) - add_dependencies(buddy_allocator_test download_data) endif() endif() diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt index b0a1c488f047c..a5a799d71da94 100644 --- a/paddle/fluid/operators/benchmark/CMakeLists.txt +++ b/paddle/fluid/operators/benchmark/CMakeLists.txt @@ -1,6 +1,6 @@ cc_test( op_tester - SRCS op_tester.cc op_tester_config.cc + SRCS op_tester.cc DEPS memory timer framework_proto diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc index bb17808047e34..a7370e30f2f65 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.cc +++ b/paddle/fluid/operators/benchmark/op_tester_config.cc @@ -20,224 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -namespace benchmark { - -static const char kStartSeparator[] = "{"; -static const char kEndSeparator[] = "}"; -static const char kSepBetweenItems[] = ";"; - -static bool StartWith(const std::string& str, const std::string& substr) { - return str.find(substr) == 0; -} - -static bool EndWith(const std::string& str, const std::string& substr) { - return str.rfind(substr) == (str.length() - substr.length()); -} - -static void EraseEndSep(std::string* str, - std::string substr = kSepBetweenItems) { - if (EndWith(*str, substr)) { - str->erase(str->length() - substr.length(), str->length()); - } -} - -OpInputConfig::OpInputConfig(std::istream& is) { - std::string sep; - is >> sep; - if (sep == kStartSeparator) { - while (sep != kEndSeparator) { - is >> sep; - if (sep == "name" || sep == "name:") { - is >> name; - EraseEndSep(&name); - } else if (sep == "dtype" || sep == "dtype:") { - ParseDType(is); - } else if (sep == "initializer" || sep == "initializer:") { - ParseInitializer(is); - } else if (sep == "dims" || sep == "dims:") { - ParseDims(is); - } else if (sep == "lod" || sep == "lod:") { - ParseLoD(is); - } else if (sep == "filename") { - is >> filename; - EraseEndSep(&filename); - } - } - } -} - -void OpInputConfig::ParseDType(std::istream& is) { - std::string dtype_str; - is >> dtype_str; - EraseEndSep(&dtype_str); - - if (dtype_str == "int32" || dtype_str == "int") { - dtype = "int32"; - } else if (dtype_str == "int64" || dtype_str == "long") { - dtype = "int64"; - } else if (dtype_str == "fp32" || dtype_str == "float") { - dtype = "fp32"; - } else if (dtype_str == "fp64" || dtype_str == "double") { - dtype = "fp64"; - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported dtype %s in OpInputConfig.", dtype_str.c_str())); - } - VLOG(4) << "dtype of input " << name << " is: " << dtype; -} - -void OpInputConfig::ParseInitializer(std::istream& is) { - std::string initializer_str; - is >> initializer_str; - EraseEndSep(&initializer_str); - - const std::vector supported_initializers = { - "random", "natural", "zeros", "file"}; - if (!Has(supported_initializers, initializer_str)) { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported initializer %s in OpInputConfig.", - initializer_str.c_str())); - } - - initializer = initializer_str; - VLOG(4) << "initializer of input " << name << " is: " << initializer; -} - -void OpInputConfig::ParseDims(std::istream& is) { - std::string dims_str; - is >> 
dims_str; - - dims.clear(); - std::string token; - std::istringstream token_stream(dims_str); - while (std::getline(token_stream, token, 'x')) { - dims.push_back(std::stoi(token)); - } -} - -void OpInputConfig::ParseLoD(std::istream& is) { - std::string lod_str; - std::string start_sep = - std::string(kStartSeparator) + std::string(kStartSeparator); - std::string end_sep = std::string(kEndSeparator) + std::string(kEndSeparator); - - std::string sep; - is >> sep; - if (StartWith(sep, start_sep)) { - lod_str += sep; - while (!EndWith(sep, end_sep)) { - is >> sep; - lod_str += sep; - } - } - EraseEndSep(&lod_str); - PADDLE_ENFORCE_GE( - lod_str.length(), - 4U, - platform::errors::InvalidArgument( - "The length of lod string should be " - "equal to or larger than 4. But length of lod string is %zu.", - lod_str.length())); - VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length(); - - // Parse the lod_str - lod.clear(); - for (size_t i = 1; i < lod_str.length() - 1;) { - if (lod_str[i] == '{') { - std::vector level; - while (lod_str[i] != '}') { - ++i; - - std::string number; - while (lod_str[i] >= '0' && lod_str[i] <= '9') { - number += lod_str[i]; - ++i; - } - level.push_back(StringTo(number)); - } - lod.push_back(level); - } else if (lod_str[i] == '}') { - ++i; - } - } -} - -OpTesterConfig::OpTesterConfig(const std::string& filename) { - std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fin), - true, - platform::errors::InvalidArgument("OpTesterConfig cannot open file %s.", - filename.c_str())); - - Init(fin); -} - -bool OpTesterConfig::Init(std::istream& is) { - std::string sep; - is >> sep; - if (sep == kStartSeparator) { - while (sep != kEndSeparator) { - is >> sep; - if (sep == "op_type" || sep == "op_type:") { - is >> op_type; - } else if (sep == "device_id" || sep == "device_id:") { - is >> device_id; - } else if (sep == "repeat" || sep == "repeat:") { - is >> repeat; - } else if (sep == "profile" || sep == "profile:") { - is >> profile; - } else if (sep == "print_debug_string" || sep == "print_debug_string:") { - is >> print_debug_string; - } else if (sep == "input" || sep == "input:") { - OpInputConfig input_config(is); - inputs.push_back(input_config); - } else if (sep == "attrs" || sep == "attrs:") { - ParseAttrs(is); - } else { - if (sep != kEndSeparator) { - return false; - } - } - } - } else { - return false; - } - return true; -} - -bool OpTesterConfig::ParseAttrs(std::istream& is) { - std::string sep; - is >> sep; - if (sep == kStartSeparator) { - while (true) { - std::string key; - is >> key; - if (key == kEndSeparator) { - break; - } - - std::string value; - is >> value; - EraseEndSep(&key, ":"); - EraseEndSep(&value); - VLOG(4) << "attrs: " << key << ", " << value; - - attrs[key] = value; - } - } - return true; -} - -const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) { - for (size_t i = 0; i < inputs.size(); ++i) { - if (inputs[i].name == name) { - return &inputs[i]; - } - } - return nullptr; -} - -} // namespace benchmark +namespace benchmark {} // namespace benchmark } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/benchmark/op_tester_config.h b/paddle/fluid/operators/benchmark/op_tester_config.h index 3956bc0a8b108..e54bb8864b523 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.h +++ b/paddle/fluid/operators/benchmark/op_tester_config.h @@ -78,6 +78,222 @@ T StringTo(const std::string& str) { return value; } +static const char kStartSeparator[] = 
"{"; +static const char kEndSeparator[] = "}"; +static const char kSepBetweenItems[] = ";"; + +static bool StartWith(const std::string& str, const std::string& substr) { + return str.find(substr) == 0; +} + +static bool EndWith(const std::string& str, const std::string& substr) { + return str.rfind(substr) == (str.length() - substr.length()); +} + +static void EraseEndSep(std::string* str, + std::string substr = kSepBetweenItems) { + if (EndWith(*str, substr)) { + str->erase(str->length() - substr.length(), str->length()); + } +} + +OpInputConfig::OpInputConfig(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (sep != kEndSeparator) { + is >> sep; + if (sep == "name" || sep == "name:") { + is >> name; + EraseEndSep(&name); + } else if (sep == "dtype" || sep == "dtype:") { + ParseDType(is); + } else if (sep == "initializer" || sep == "initializer:") { + ParseInitializer(is); + } else if (sep == "dims" || sep == "dims:") { + ParseDims(is); + } else if (sep == "lod" || sep == "lod:") { + ParseLoD(is); + } else if (sep == "filename") { + is >> filename; + EraseEndSep(&filename); + } + } + } +} + +void OpInputConfig::ParseDType(std::istream& is) { + std::string dtype_str; + is >> dtype_str; + EraseEndSep(&dtype_str); + + if (dtype_str == "int32" || dtype_str == "int") { + dtype = "int32"; + } else if (dtype_str == "int64" || dtype_str == "long") { + dtype = "int64"; + } else if (dtype_str == "fp32" || dtype_str == "float") { + dtype = "fp32"; + } else if (dtype_str == "fp64" || dtype_str == "double") { + dtype = "fp64"; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported dtype %s in OpInputConfig.", dtype_str.c_str())); + } + VLOG(4) << "dtype of input " << name << " is: " << dtype; +} + +void OpInputConfig::ParseInitializer(std::istream& is) { + std::string initializer_str; + is >> initializer_str; + EraseEndSep(&initializer_str); + + const std::vector supported_initializers = { + "random", "natural", "zeros", "file"}; + if (!Has(supported_initializers, initializer_str)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported initializer %s in OpInputConfig.", + initializer_str.c_str())); + } + + initializer = initializer_str; + VLOG(4) << "initializer of input " << name << " is: " << initializer; +} + +void OpInputConfig::ParseDims(std::istream& is) { + std::string dims_str; + is >> dims_str; + + dims.clear(); + std::string token; + std::istringstream token_stream(dims_str); + while (std::getline(token_stream, token, 'x')) { + dims.push_back(std::stoi(token)); + } +} + +void OpInputConfig::ParseLoD(std::istream& is) { + std::string lod_str; + std::string start_sep = + std::string(kStartSeparator) + std::string(kStartSeparator); + std::string end_sep = std::string(kEndSeparator) + std::string(kEndSeparator); + + std::string sep; + is >> sep; + if (StartWith(sep, start_sep)) { + lod_str += sep; + while (!EndWith(sep, end_sep)) { + is >> sep; + lod_str += sep; + } + } + EraseEndSep(&lod_str); + PADDLE_ENFORCE_GE( + lod_str.length(), + 4U, + platform::errors::InvalidArgument( + "The length of lod string should be " + "equal to or larger than 4. 
But length of lod string is %zu.", + lod_str.length())); + VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length(); + + // Parse the lod_str + lod.clear(); + for (size_t i = 1; i < lod_str.length() - 1;) { + if (lod_str[i] == '{') { + std::vector level; + while (lod_str[i] != '}') { + ++i; + + std::string number; + while (lod_str[i] >= '0' && lod_str[i] <= '9') { + number += lod_str[i]; + ++i; + } + level.push_back(StringTo(number)); + } + lod.push_back(level); + } else if (lod_str[i] == '}') { + ++i; + } + } +} + +OpTesterConfig::OpTesterConfig(const std::string& filename) { + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fin), + true, + platform::errors::InvalidArgument("OpTesterConfig cannot open file %s.", + filename.c_str())); + + Init(fin); +} + +bool OpTesterConfig::Init(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (sep != kEndSeparator) { + is >> sep; + if (sep == "op_type" || sep == "op_type:") { + is >> op_type; + } else if (sep == "device_id" || sep == "device_id:") { + is >> device_id; + } else if (sep == "repeat" || sep == "repeat:") { + is >> repeat; + } else if (sep == "profile" || sep == "profile:") { + is >> profile; + } else if (sep == "print_debug_string" || sep == "print_debug_string:") { + is >> print_debug_string; + } else if (sep == "input" || sep == "input:") { + OpInputConfig input_config(is); + inputs.push_back(input_config); + } else if (sep == "attrs" || sep == "attrs:") { + ParseAttrs(is); + } else { + if (sep != kEndSeparator) { + return false; + } + } + } + } else { + return false; + } + return true; +} + +bool OpTesterConfig::ParseAttrs(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (true) { + std::string key; + is >> key; + if (key == kEndSeparator) { + break; + } + + std::string value; + is >> value; + EraseEndSep(&key, ":"); + EraseEndSep(&value); + VLOG(4) << "attrs: " << key << ", " << value; + + attrs[key] = value; + } + } + return true; +} + +const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) { + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i].name == name) { + return &inputs[i]; + } + } + return nullptr; +} + } // namespace benchmark } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index f2a4201fd960d..e4063436c0336 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -30,37 +30,48 @@ set(CINN_OP_DEPS register_operators(DEPS ${CINN_OP_DEPS}) if(WITH_TESTING) - cc_test( + cc_test_old( cinn_launch_context_test - SRCS cinn_launch_context_test.cc - DEPS ddim - lod_tensor - scope - proto_desc - graph - cinn_launch_context - cinn_instruction_run_op - cinn) + SRCS + cinn_launch_context_test.cc + DEPS + ddim + lod_tensor + scope + proto_desc + graph + cinn_launch_context + cinn_instruction_run_op + cinn) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") set(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda" ) - cc_test( + cc_test_old( cinn_launch_op_test - SRCS cinn_launch_op_test.cc - DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op - elementwise_add_op gflags) + SRCS + cinn_launch_op_test.cc + DEPS + cinn_compiler + cinn_launch_op + cinn_instruction_run_op + elementwise_add_op + gflags) 
set_tests_properties( cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}") - cc_test( + cc_test_old( cinn_instruction_run_op_test - SRCS cinn_instruction_run_op_test.cc - DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op - elementwise_add_op) + SRCS + cinn_instruction_run_op_test.cc + DEPS + cinn_compiler + cinn_launch_op + cinn_instruction_run_op + elementwise_add_op) set_tests_properties( cinn_instruction_run_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}") diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc index e23deb348c985..d0b20a2f08066 100644 --- a/paddle/fluid/operators/copy_cross_scope_test.cc +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/copy_cross_scope_op.cc" #include "paddle/fluid/string/printf.h" #define Conn(x, y) x##y diff --git a/paddle/fluid/operators/lite/CMakeLists.txt b/paddle/fluid/operators/lite/CMakeLists.txt index 3955c6e322b0e..c272e8bc94c50 100644 --- a/paddle/fluid/operators/lite/CMakeLists.txt +++ b/paddle/fluid/operators/lite/CMakeLists.txt @@ -1,5 +1,3 @@ op_library(lite_engine_op DEPS lite_engine lite_tensor_utils) -cc_test( - test_lite_engine_op - SRCS lite_engine_op_test.cc - DEPS lite_engine_op analysis) +cc_test_old(test_lite_engine_op SRCS lite_engine_op_test.cc DEPS lite_engine_op + analysis) diff --git a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake index 4c94bc3f3ad57..4f88b091c02ae 100644 --- a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake +++ b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake @@ -1,14 +1,16 @@ -cc_test( +cc_test_old( test_mkldnn_op_nhwc - SRCS mkldnn/test_mkldnn_op_nhwc.cc - DEPS op_registry - pool_op - shape_op - crop_op - activation_op - pooling - transpose_op - scope - device_context - enforce - executor) + SRCS + mkldnn/test_mkldnn_op_nhwc.cc + DEPS + op_registry + pool_op + shape_op + crop_op + activation_op + pooling + transpose_op + scope + device_context + enforce + executor) diff --git a/paddle/fluid/operators/prim_ops/CMakeLists.txt b/paddle/fluid/operators/prim_ops/CMakeLists.txt index f63d43a9314b4..85741b2f5f3a8 100644 --- a/paddle/fluid/operators/prim_ops/CMakeLists.txt +++ b/paddle/fluid/operators/prim_ops/CMakeLists.txt @@ -42,7 +42,4 @@ set(PRIM_OP_SRCS rsqrt_p_op.cc uniform_random_p_op.cc) -cc_test( - prim_op_test - SRCS prim_op_test.cc ${PRIM_OP_SRCS} - DEPS op_registry) +cc_test_old(prim_op_test SRCS prim_op_test.cc ${PRIM_OP_SRCS} DEPS op_registry) diff --git a/paddle/fluid/operators/pscore/CMakeLists.txt b/paddle/fluid/operators/pscore/CMakeLists.txt index 04407ea117d17..1c4771c7b4d07 100755 --- a/paddle/fluid/operators/pscore/CMakeLists.txt +++ b/paddle/fluid/operators/pscore/CMakeLists.txt @@ -76,61 +76,69 @@ set(OPERATOR_DEPS set_source_files_properties( heter_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( heter_server_test - SRCS heter_server_test.cc - DEPS ${RPC_DEPS} - ${DISTRIBUTE_DEPS} - executor - scope - proto_desc - scale_op - eigen_function) + SRCS + heter_server_test.cc + DEPS + ${RPC_DEPS} + ${DISTRIBUTE_DEPS} + executor + scope + proto_desc + scale_op + eigen_function) set_source_files_properties( 
send_and_recv_op_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( send_and_recv_cpu_test - SRCS send_and_recv_op_cpu_test.cc - DEPS executor - scope - proto_desc - scale_op - send_and_recv_op - ${RPC_DEPS} - ${DISTRIBUTE_DEPS} - eigen_function) + SRCS + send_and_recv_op_cpu_test.cc + DEPS + executor + scope + proto_desc + scale_op + send_and_recv_op + ${RPC_DEPS} + ${DISTRIBUTE_DEPS} + eigen_function) set_source_files_properties( send_and_recv_op_gpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( send_and_recv_gpu_test - SRCS send_and_recv_op_gpu_test.cc - DEPS executor - scope - proto_desc - scale_op - send_and_recv_op - ${RPC_DEPS} - ${DISTRIBUTE_DEPS} - eigen_function) + SRCS + send_and_recv_op_gpu_test.cc + DEPS + executor + scope + proto_desc + scale_op + send_and_recv_op + ${RPC_DEPS} + ${DISTRIBUTE_DEPS} + eigen_function) set_source_files_properties( heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test( +cc_test_old( heter_listen_and_server_test - SRCS heter_listen_and_server_test.cc - DEPS executor - scope - proto_desc - scale_op - heter_listen_and_serv_op - ${RPC_DEPS} - ${DISTRIBUTE_DEPS} - eigen_function) + SRCS + heter_listen_and_server_test.cc + DEPS + executor + scope + proto_desc + scale_op + heter_listen_and_serv_op + ${RPC_DEPS} + ${DISTRIBUTE_DEPS} + eigen_function) #set_source_files_properties(heter_cloud_comm_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(heter_cloud_comm_cpu_test SRCS heter_cloud_comm_cpu_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function) diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index a5bc1fff54636..e03b2e333caa0 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -617,11 +617,20 @@ if(WITH_PYTHON) if(WIN32) set(SHARD_LIB_NAME libpaddle) endif() + set_property(GLOBAL PROPERTY PADDLE_LIB_NAME ${SHARD_LIB_NAME}) cc_library( ${SHARD_LIB_NAME} SHARED SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + # TODO(zhiqiu): some symbols not exported even setting the following + # property. Need to find a better way. 
+ + # if(WIN32) + # set_property(TARGET ${SHARD_LIB_NAME} + # PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON) + # endif() + if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_dependencies(${SHARD_LIB_NAME} legacy_eager_codegen) add_dependencies(${SHARD_LIB_NAME} eager_legacy_op_function_generator_cmd) diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 4a0c99f987812..3dc2920537568 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -24,10 +24,15 @@ cc_test( test_op_utils SRCS test_op_utils.cc DEPS op_compat_infos) -cc_test( +cc_test_old( test_meta_fn_utils - SRCS test_meta_fn_utils.cc - DEPS dense_tensor wrapped_infermeta infermeta infermeta_utils) + SRCS + test_meta_fn_utils.cc + DEPS + dense_tensor + wrapped_infermeta + infermeta + infermeta_utils) cc_test( test_ddim diff --git a/paddle/phi/tests/ops/test_op_signature.cc b/paddle/phi/tests/ops/test_op_signature.cc index 204b7f359a6b4..de97634ed61db 100644 --- a/paddle/phi/tests/ops/test_op_signature.cc +++ b/paddle/phi/tests/ops/test_op_signature.cc @@ -33,7 +33,7 @@ TEST(ARG_MAP, fill_constant) { {"ShapeTensor", "ValueTensor"}, {}, {}, {}, {"Out"}); auto signature1 = (*OpUtilsMap::Instance().GetArgumentMappingFn( "fill_constant"))(arg_case1); - ASSERT_EQ(signature1.name, "full_sr"); + EXPECT_STREQ(signature1.name, "full_sr"); TestArgumentMappingContext arg_case2( {"ShapeTensor"}, @@ -43,7 +43,7 @@ TEST(ARG_MAP, fill_constant) { {"Out"}); auto signature2 = (*OpUtilsMap::Instance().GetArgumentMappingFn( "fill_constant"))(arg_case2); - ASSERT_EQ(signature2.name, "full_sr"); + EXPECT_STREQ(signature2.name, "full_sr"); TestArgumentMappingContext arg_case3( {"ShapeTensor"}, @@ -53,13 +53,13 @@ TEST(ARG_MAP, fill_constant) { {"Out"}); auto signature3 = (*OpUtilsMap::Instance().GetArgumentMappingFn( "fill_constant"))(arg_case3); - ASSERT_EQ(signature3.name, "full_sr"); + EXPECT_STREQ(signature3.name, "full_sr"); TestArgumentMappingContext arg_case4( {"ShapeTensorList", "ValueTensor"}, {}, {}, {}, {"Out"}); auto signature4 = (*OpUtilsMap::Instance().GetArgumentMappingFn( "fill_constant"))(arg_case4); - ASSERT_EQ(signature4.name, "full_sr"); + EXPECT_STREQ(signature4.name, "full_sr"); TestArgumentMappingContext arg_case5( {"ShapeTensorList"}, @@ -69,7 +69,7 @@ TEST(ARG_MAP, fill_constant) { {"Out"}); auto signature5 = (*OpUtilsMap::Instance().GetArgumentMappingFn( "fill_constant"))(arg_case5); - ASSERT_EQ(signature5.name, "full_sr"); + EXPECT_STREQ(signature5.name, "full_sr"); TestArgumentMappingContext arg_case6( {"ShapeTensorList"}, @@ -79,7 +79,7 @@ TEST(ARG_MAP, fill_constant) { {"Out"}); auto signature6 = (*OpUtilsMap::Instance().GetArgumentMappingFn( "fill_constant"))(arg_case6); - ASSERT_EQ(signature6.name, "full_sr"); + EXPECT_STREQ(signature6.name, "full_sr"); TestArgumentMappingContext arg_case7( {"ValueTensor"}, @@ -89,7 +89,7 @@ TEST(ARG_MAP, fill_constant) { {"Out"}); auto signature7 = (*OpUtilsMap::Instance().GetArgumentMappingFn( "fill_constant"))(arg_case7); - ASSERT_EQ(signature7.name, "full_sr"); + EXPECT_STREQ(signature7.name, "full_sr"); TestArgumentMappingContext arg_case8( {}, @@ -101,7 +101,7 @@ TEST(ARG_MAP, fill_constant) { {"Out"}); auto signature8 = (*OpUtilsMap::Instance().GetArgumentMappingFn( "fill_constant"))(arg_case8); - ASSERT_EQ(signature8.name, "full_sr"); + EXPECT_STREQ(signature8.name, "full_sr"); TestArgumentMappingContext arg_case9( {}, @@ -112,7 +112,7 @@ TEST(ARG_MAP, fill_constant) { {"Out"}); auto signature9 = 
(*OpUtilsMap::Instance().GetArgumentMappingFn( "fill_constant"))(arg_case9); - ASSERT_EQ(signature9.name, "full_sr"); + EXPECT_STREQ(signature9.name, "full_sr"); } TEST(ARG_MAP, set_value) { @@ -122,7 +122,7 @@ TEST(ARG_MAP, set_value) { {{"fp32_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case) .name, "set_value"); @@ -133,7 +133,7 @@ TEST(ARG_MAP, set_value) { {{"fp64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case1) .name, "set_value"); @@ -144,7 +144,7 @@ TEST(ARG_MAP, set_value) { {{"int32_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case2) .name, "set_value"); @@ -155,7 +155,7 @@ TEST(ARG_MAP, set_value) { {{"int64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case3) .name, "set_value"); @@ -166,7 +166,7 @@ TEST(ARG_MAP, set_value) { {{"bool_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case4) .name, "set_value"); @@ -177,7 +177,7 @@ TEST(ARG_MAP, set_value) { {}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case5) .name, "set_value_with_tensor"); @@ -188,7 +188,7 @@ TEST(ARG_MAP, set_value) { {{"fp64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case6) .name, "set_value"); @@ -199,7 +199,7 @@ TEST(ARG_MAP, set_value) { {{"int32_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case7) .name, "set_value"); @@ -210,7 +210,7 @@ TEST(ARG_MAP, set_value) { {{"int64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case8) .name, "set_value"); @@ -221,7 +221,7 @@ TEST(ARG_MAP, set_value) { {{"bool_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case9) .name, "set_value"); @@ -232,7 +232,7 @@ TEST(ARG_MAP, set_value) { {}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case10) .name, "set_value_with_tensor"); @@ -243,7 +243,7 @@ TEST(ARG_MAP, set_value) { {{"fp64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case11) .name, "set_value"); @@ -254,7 +254,7 @@ TEST(ARG_MAP, set_value) { {{"int32_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case12) .name, "set_value"); @@ -265,7 +265,7 @@ TEST(ARG_MAP, set_value) { {{"int64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case13) .name, "set_value"); @@ -276,14 +276,14 @@ TEST(ARG_MAP, set_value) { {{"bool_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( 
(*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case14) .name, "set_value"); TestArgumentMappingContext arg_case15( {"Input", "StartsTensorList", "ValueTensor"}, {}, {}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case15) .name, "set_value_with_tensor"); @@ -294,7 +294,7 @@ TEST(ARG_MAP, set_value) { {{"fp32_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case16) .name, "set_value"); @@ -305,7 +305,7 @@ TEST(ARG_MAP, set_value) { {{"fp64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case17) .name, "set_value"); @@ -316,7 +316,7 @@ TEST(ARG_MAP, set_value) { {{"int32_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case18) .name, "set_value"); @@ -327,7 +327,7 @@ TEST(ARG_MAP, set_value) { {{"int64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case19) .name, "set_value"); @@ -338,7 +338,7 @@ TEST(ARG_MAP, set_value) { {{"bool_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case20) .name, "set_value"); @@ -349,7 +349,7 @@ TEST(ARG_MAP, set_value) { {}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case21) .name, "set_value_with_tensor"); @@ -360,7 +360,7 @@ TEST(ARG_MAP, set_value) { {{"fp64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case22) .name, "set_value"); @@ -371,7 +371,7 @@ TEST(ARG_MAP, set_value) { {{"int32_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case23) .name, "set_value"); @@ -382,7 +382,7 @@ TEST(ARG_MAP, set_value) { {{"int64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case24) .name, "set_value"); @@ -393,14 +393,14 @@ TEST(ARG_MAP, set_value) { {{"bool_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case25) .name, "set_value"); TestArgumentMappingContext arg_case26( {"Input", "EndsTensorList", "ValueTensor"}, {}, {}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case26) .name, "set_value_with_tensor"); @@ -411,7 +411,7 @@ TEST(ARG_MAP, set_value) { {{"fp32_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case27) .name, "set_value"); @@ -422,7 +422,7 @@ TEST(ARG_MAP, set_value) { {{"fp64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case28) .name, "set_value"); @@ -433,7 +433,7 @@ TEST(ARG_MAP, set_value) { {{"int32_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case29) .name, "set_value"); @@ 
-444,7 +444,7 @@ TEST(ARG_MAP, set_value) { {{"int64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case30) .name, "set_value"); @@ -455,14 +455,14 @@ TEST(ARG_MAP, set_value) { {{"bool_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case31) .name, "set_value"); TestArgumentMappingContext arg_case32( {"Input", "StepsTensorList", "ValueTensor"}, {}, {}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case32) .name, "set_value_with_tensor"); @@ -473,7 +473,7 @@ TEST(ARG_MAP, set_value) { {{"fp32_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case33) .name, "set_value"); @@ -484,7 +484,7 @@ TEST(ARG_MAP, set_value) { {{"fp64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case34) .name, "set_value"); @@ -495,7 +495,7 @@ TEST(ARG_MAP, set_value) { {{"int32_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case35) .name, "set_value"); @@ -506,7 +506,7 @@ TEST(ARG_MAP, set_value) { {{"int64_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case36) .name, "set_value"); @@ -517,7 +517,7 @@ TEST(ARG_MAP, set_value) { {{"bool_values", paddle::any{std::vector{1}}}}, {"Out"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value"))(arg_case37) .name, "set_value"); @@ -530,7 +530,7 @@ TEST(ARG_MAP, set_value_grad) { {}, {"Input@GRAD", "ValueTensor@GRAD"}, {}); - ASSERT_EQ( + EXPECT_STREQ( (*OpUtilsMap::Instance().GetArgumentMappingFn("set_value_grad"))(arg_case) .name, "set_value_grad"); @@ -541,20 +541,20 @@ TEST(ARG_MAP, set_value_grad) { {}, {"Input@GRAD", "ValueTensor@GRAD"}, {}); - ASSERT_EQ((*OpUtilsMap::Instance().GetArgumentMappingFn("set_value_grad"))( - arg_case1) - .name, - "set_value_grad"); + EXPECT_STREQ((*OpUtilsMap::Instance().GetArgumentMappingFn("set_value_grad"))( + arg_case1) + .name, + "set_value_grad"); TestArgumentMappingContext arg_case2({"Out@GRAD", "StartsTensorList"}, {}, {}, {"Input@GRAD", "ValueTensor@GRAD"}, {}); - ASSERT_EQ((*OpUtilsMap::Instance().GetArgumentMappingFn("set_value_grad"))( - arg_case2) - .name, - "set_value_grad"); + EXPECT_STREQ((*OpUtilsMap::Instance().GetArgumentMappingFn("set_value_grad"))( + arg_case2) + .name, + "set_value_grad"); TestArgumentMappingContext arg_case3( {"Out@GRAD", "EndsTensorList", "StepsTensorList"}, @@ -562,30 +562,30 @@ TEST(ARG_MAP, set_value_grad) { {}, {"Input@GRAD", "ValueTensor@GRAD"}, {}); - ASSERT_EQ((*OpUtilsMap::Instance().GetArgumentMappingFn("set_value_grad"))( - arg_case3) - .name, - "set_value_grad"); + EXPECT_STREQ((*OpUtilsMap::Instance().GetArgumentMappingFn("set_value_grad"))( + arg_case3) + .name, + "set_value_grad"); TestArgumentMappingContext arg_case4({"Out@GRAD", "EndsTensorList"}, {}, {}, {"Input@GRAD", "ValueTensor@GRAD"}, {}); - ASSERT_EQ((*OpUtilsMap::Instance().GetArgumentMappingFn("set_value_grad"))( - arg_case4) - .name, - "set_value_grad"); + EXPECT_STREQ((*OpUtilsMap::Instance().GetArgumentMappingFn("set_value_grad"))( + 
arg_case4) + .name, + "set_value_grad"); TestArgumentMappingContext arg_case5({"Out@GRAD", "StepsTensorList"}, {}, {}, {"Input@GRAD", "ValueTensor@GRAD"}, {}); - ASSERT_EQ((*OpUtilsMap::Instance().GetArgumentMappingFn("set_value_grad"))( - arg_case5) - .name, - "set_value_grad"); + EXPECT_STREQ((*OpUtilsMap::Instance().GetArgumentMappingFn("set_value_grad"))( + arg_case5) + .name, + "set_value_grad"); } TEST(ARG_MAP, allclose) { @@ -598,8 +598,8 @@ TEST(ARG_MAP, allclose) { {}); auto signature1 = (*OpUtilsMap::Instance().GetArgumentMappingFn("allclose"))(arg_case1); - ASSERT_EQ(signature1.name, "allclose"); - ASSERT_EQ(signature1.attr_names[0], "Rtol"); + EXPECT_STREQ(signature1.name, "allclose"); + EXPECT_STREQ(signature1.attr_names[0], "Rtol"); TestArgumentMappingContext arg_case2( {"Input", "Other", "Atol"}, @@ -610,26 +610,26 @@ TEST(ARG_MAP, allclose) { {}); auto signature2 = (*OpUtilsMap::Instance().GetArgumentMappingFn("allclose"))(arg_case2); - ASSERT_EQ(signature2.name, "allclose"); - ASSERT_EQ(signature2.attr_names[1], "Atol"); + EXPECT_STREQ(signature2.name, "allclose"); + EXPECT_STREQ(signature2.attr_names[1], "Atol"); } TEST(ARG_MAP, reshape) { TestArgumentMappingContext arg_case1({"X", "ShapeTensor"}, {}, {}, {"Out"}); auto signature1 = (*OpUtilsMap::Instance().GetArgumentMappingFn("reshape2"))(arg_case1); - ASSERT_EQ(signature1.name, "reshape"); + EXPECT_STREQ(signature1.name, "reshape"); TestArgumentMappingContext arg_case2({"X", "Shape"}, {}, {}, {"Out"}); auto signature2 = (*OpUtilsMap::Instance().GetArgumentMappingFn("reshape2"))(arg_case2); - ASSERT_EQ(signature2.name, "reshape"); + EXPECT_STREQ(signature2.name, "reshape"); TestArgumentMappingContext arg_case3( {"X"}, {}, {{"shape", paddle::any(std::vector({1, 2}))}}, {"Out"}); auto signature3 = (*OpUtilsMap::Instance().GetArgumentMappingFn("reshape2"))(arg_case3); - ASSERT_EQ(signature3.name, "reshape"); + EXPECT_STREQ(signature3.name, "reshape"); } } // namespace tests diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 7b02aef22e8dc..2c34547319378 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -18,4 +18,12 @@ if(WITH_TESTING) paddle_gtest_main SRCS paddle_gtest_main.cc DEPS ${paddle_gtest_main_deps}) + + cc_library( + paddle_gtest_main_new + SRCS paddle_gtest_main.cc + DEPS gtest xxhash framework_proto eigen3 dlpack) + if(WITH_MKLDNN) + add_dependencies(paddle_gtest_main_new mkldnn) + endif() endif() From 8775545a7dd4e54d46ad5ba48db6d31e0aea0dd2 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 27 Oct 2022 19:06:17 +0800 Subject: [PATCH 09/91] support prepare_data for selected_rows in c++ api (#47380) --- paddle/phi/api/lib/data_transform.cc | 46 +++++++++++++++++++++-- paddle/phi/api/lib/data_transform.h | 11 ++++++ paddle/phi/api/yaml/generator/api_base.py | 2 +- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 048a24ff5e312..c6a773ebe5fc7 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -169,10 +169,6 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, VLOG(3) << "DeviceTransform in, src_place " << tensor.place() << " dst_place: " << dst_place; - DefaultAllocator alloc(dst_place); - phi::DenseTensor out(&alloc, - {tensor.dtype(), tensor.dims(), tensor.layout()}); - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& pool = paddle::platform::DeviceContextPool::Instance(); // 
NOTE(yy): TransDataPlace should wait for computation of input. @@ -191,6 +187,7 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, // the transforming is from CPU to GPU and the number of elements is little. // But the embarrassment is that this solution makes training // slower. + phi::DenseTensor out; paddle::framework::TensorCopySync(tensor, dst_place, &out); return out; } @@ -305,6 +302,47 @@ paddle::optional> PrepareData( return paddle::none; } +std::shared_ptr PrepareDataForSelectedRows( + const Tensor& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag) { + const auto& tensor_in = input.impl(); + if (tensor_in) { + phi::SelectedRows& selected_rows = + *static_cast(tensor_in.get()); + if (!transform_flag.NeedTransform() || !selected_rows.initialized() || + (!NeedTransformPlace( + selected_rows.place(), target_args_def.backend, transform_flag))) { + return std::static_pointer_cast(tensor_in); + } + + auto dense_out = TransDataPlace( + selected_rows.value(), phi::TransToPhiPlace(target_args_def.backend)); + if (selected_rows.place().GetType() == AllocationType::GPUPINNED) { + selected_rows.mutable_value()->ShareBufferWith(dense_out); + return std::static_pointer_cast(tensor_in); + } + + auto out_new = std::make_shared(selected_rows.rows(), + selected_rows.height()); + *out_new->mutable_value() = dense_out; + return out_new; + } + PADDLE_THROW(phi::errors::InvalidArgument( + "The impl() of input tensor is nullptr, it doesn't support " + "selected_rows data transform now.")); +} + +paddle::optional PrepareDataForSelectedRows( + const paddle::optional& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag) { + if (input) { + return *PrepareDataForSelectedRows(*input, target_args_def, transform_flag); + } + return paddle::none; +} + void TransDataBackend(const phi::DenseTensor* tensor, Backend target_backend, phi::DenseTensor* out) { diff --git a/paddle/phi/api/lib/data_transform.h b/paddle/phi/api/lib/data_transform.h index 7695855e30b56..7a97bb01f61fa 100644 --- a/paddle/phi/api/lib/data_transform.h +++ b/paddle/phi/api/lib/data_transform.h @@ -82,6 +82,17 @@ paddle::optional> PrepareData( const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag); +// Only support transferring place for SelectedRows +std::shared_ptr PrepareDataForSelectedRows( + const Tensor& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag); + +paddle::optional PrepareDataForSelectedRows( + const paddle::optional& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag); + void TransDataBackend(const phi::DenseTensor* tensor, Backend target_backend, phi::DenseTensor* out); diff --git a/paddle/phi/api/yaml/generator/api_base.py b/paddle/phi/api/yaml/generator/api_base.py index e67023d2faf71..53b950b63f052 100644 --- a/paddle/phi/api/yaml/generator/api_base.py +++ b/paddle/phi/api/yaml/generator/api_base.py @@ -715,7 +715,7 @@ def gene_selected_rows_input( input_tensor_code = ( input_tensor_code + f""" -{code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = TensorToSelectedRows({input_name}); +{code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = PrepareDataForSelectedRows({input_name}, kernel.InputAt({kernel_param.index(input_name)}), {trans_flag}); """ ) return input_tensor_code From b68c4a1e6dc442a248f8f51650249f0559dbd5bd Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Thu, 27 
Oct 2022 19:18:04 +0800 Subject: [PATCH 10/91] [Dy2St]Fix abnormal growth of memory in train mode and no_grad for Dy2St (#47398) Fix abnormal growth of memory in train mode and no_grad for Dy2St --- paddle/fluid/eager/to_static/run_program_op_node.h | 4 ++-- python/paddle/fluid/dygraph/io.py | 13 ------------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 4d6e8d93107c3..db3db215e2bca 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -403,7 +403,7 @@ inline void RunProgramAPI( VLOG(3) << paddle::framework::GenScopeTreeDebugInfo( out_scope_vec->front()); - if (is_test) { + if (is_test || !egr::Controller::Instance().HasGrad()) { VLOG(4) << "is test, set this scope can reused"; global_inner_scope->SetCanReuesd(true); details::GcScope(global_inner_scope); @@ -481,7 +481,7 @@ inline void RunProgramAPI( // Debug info: scope info when run end VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); // Step 5. Drop all children scopes while testing. - if (is_test) { + if (is_test || !egr::Controller::Instance().HasGrad()) { out_scope_vec->front()->DropKids(); } VLOG(2) << "The number of sub scopes after forward: " diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index c5eb445cc610d..3d85a223f6996 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -1058,8 +1058,6 @@ def _run_dygraph(instance, input, program_holder): continue persistable_var._set_grad_type(grad_var.type()) - drop_scope_if_no_grad(instance, tmp_scope_vec) - # 3. prepare output, keep same form with inputs outs = output_vars if len(output_vars) == 1: @@ -1067,17 +1065,6 @@ def _run_dygraph(instance, input, program_holder): return outs -def drop_scope_if_no_grad(instance, scope_vec): - tracer = framework._dygraph_tracer() - scope = ( - scope_vec.value().get_scope() - if isinstance(scope_vec, (core.VarBase)) - else scope_vec[0] - ) - if (not instance._is_test) and (not tracer._has_grad): - scope.drop_kids() - - def _run_static_graph(input, program_holder, trace_program): main_program = framework.default_main_program() param_var_names = _get_persistable_var_names(trace_program) From 5429d145978118da922d5c0e610a7761ee450ca5 Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Thu, 27 Oct 2022 19:21:33 +0800 Subject: [PATCH 11/91] update dygraph PTQ export_model api (#47284) --- .../slim/quantization/imperative/ptq.py | 32 +++++++++++++------ .../quantization/imperative/ptq_registry.py | 1 + 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py index 9c028736d6826..22c30b3166cee 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py @@ -121,7 +121,7 @@ def save_quantized_model(self, model, path, input_spec=None, **config): InputSpec or example Tensor. If None, all input variables of the original Layer's forward method would be the inputs of the saved model. Default None. - **configs (dict, optional): Other save configuration options for + **config (dict, optional): Other save configuration options for compatibility. We do not recommend using these configurations, they may be removed in the future. 
If not necessary, DO NOT use them. Default None. @@ -140,11 +140,15 @@ def save_quantized_model(self, model, path, input_spec=None, **config): assert isinstance( model, paddle.nn.Layer ), "The model must be the instance of paddle.nn.Layer." + is_postprocess = config.get('postprocess', False) + config.pop('postprocess', None) # Convert and save dygraph quantized model self._convert(model) paddle.jit.save(layer=model, path=path, input_spec=input_spec, **config) + if not is_postprocess: + return # Load inference program is_dynamic_mode = False @@ -272,10 +276,16 @@ def _save_output_thresholds(self, sub_layer, quant_config): output_names = layer_info.output_names output_thresholds = quant_config.out_act_quantizer.thresholds assert len(output_names) == 1 - assert len(output_thresholds) == 1 - save_name = output_names[0] + str(0) + "_threshold" - sub_layer._set_op_attrs({save_name: output_thresholds[0]}) - sub_layer._set_op_attrs({"out_threshold": output_thresholds[0]}) + if len(output_thresholds) == 1: + save_name = output_names[0] + str(0) + "_threshold" + sub_layer._set_op_attrs({save_name: output_thresholds[0]}) + sub_layer._set_op_attrs({"out_threshold": output_thresholds[0]}) + else: + _logger.warning( + "output_thresholds shape of {} needs to be 1, but received {}".format( + output_names[0], len(output_thresholds) + ) + ) def _wrap_simulated_layers(self, model): """ @@ -326,11 +336,13 @@ def _wrap_simulated_layers(self, model): # save the input thresholds assert hasattr(quant_layer, "_fake_quant_input") assert hasattr(quant_layer._fake_quant_input, "_scale") - assert len(in_act_quantizer.thresholds) == 1 - input_threshold = np.array( - [in_act_quantizer.thresholds[0]], dtype=np.float32 - ) - quant_layer._fake_quant_input._scale.set_value(input_threshold) + if len(in_act_quantizer.thresholds) == 1: + input_threshold = np.array( + [in_act_quantizer.thresholds[0]], dtype=np.float32 + ) + quant_layer._fake_quant_input._scale.set_value( + input_threshold + ) assert hasattr(quant_layer, "_fake_quant_weight") assert hasattr(quant_layer._fake_quant_weight, "_scale") diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_registry.py b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_registry.py index 5bc7fc0c6b351..e7b6a243abece 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_registry.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_registry.py @@ -41,6 +41,7 @@ def __init__(self, layer, input_names, weight_names, output_names): LayerInfo(paddle.nn.ReLU, ['X'], [], ['Out']), LayerInfo(paddle.nn.ReLU6, ['X'], [], ['Out']), LayerInfo(paddle.nn.Hardswish, ['X'], [], ['Out']), + LayerInfo(paddle.nn.Swish, ['X'], [], ['Out']), LayerInfo(paddle.nn.Sigmoid, ['X'], [], ['Out']), LayerInfo(paddle.nn.Softmax, ['X'], [], ['Out']), LayerInfo(paddle.nn.Tanh, ['X'], [], ['Out']), From 0972d6ac78e8e7696c256da8dc961b1d7ed8fe93 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Thu, 27 Oct 2022 20:56:40 +0800 Subject: [PATCH 12/91] [Paddle Inference] improve convert_to_mixed_precision (#47333) --- .../passes/convert_to_mixed_precision.cc | 440 +++++++++--------- .../passes/convert_to_mixed_precision.h | 4 +- 2 files changed, 223 insertions(+), 221 deletions(-) diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index 219ce6d17e053..9d0e6ecf49aed 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ 
b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -42,13 +42,13 @@ #include "paddle/phi/common/place.h" #include "paddle/phi/core/tensor_meta.h" -using namespace paddle::framework; // NOLINT - namespace paddle { namespace inference { namespace analysis { namespace { +using VarType = framework::proto::VarType; + bool PhiKernelSupportPrecision( const std::string& op_type, phi::Backend backend, @@ -73,13 +73,14 @@ bool GpuKernelSupportPrecision( phi_op_type, phi::Backend::GPUDNN, data_type, layout); if (!res) { - auto& all_kernels = OperatorWithKernel::AllOpKernels(); + auto& all_kernels = framework::OperatorWithKernel::AllOpKernels(); auto it = all_kernels.find(op_type); if (it != all_kernels.end()) { for (auto& kern_pair : it->second) { if (platform::is_gpu_place(kern_pair.first.place_) && - kern_pair.first.data_type_ == framework::proto::VarType::FP16) { + kern_pair.first.data_type_ == VarType::FP16) { res = true; + break; } } } @@ -88,6 +89,8 @@ bool GpuKernelSupportPrecision( } class ConvertToMixedPrecisionPass { + using BlockID = size_t; + public: explicit ConvertToMixedPrecisionPass( const std::string& model_file, @@ -97,7 +100,7 @@ class ConvertToMixedPrecisionPass { phi::DataType mixed_precision, phi::Backend backend, bool keep_io_types, - std::unordered_set black_list) + const std::unordered_set& black_list) : model_file_(model_file), params_file_(params_file), mixed_model_file_(mixed_model_file), @@ -107,45 +110,40 @@ class ConvertToMixedPrecisionPass { keep_io_types_(keep_io_types), black_list_(black_list), place_(paddle::CPUPlace()), - executor_(place_) { - black_list_.insert("assign"); - black_list_.insert("fill_constant"); - black_list_.insert("assign_value"); - black_list_.insert("eye"); - black_list_.insert("fill_any_like"); - black_list_.insert("fill_constant_batch_size_like"); - } + executor_(place_) {} + void Run(); private: void LoadAndPrepare(); - inline bool NodeVarHasDtype(framework::ir::Node* node); + inline bool VarNodeHasDtype(framework::ir::Node* node); void ConvertAllFp64ToFp32(framework::ir::Graph* graph); void FixCastAttr(framework::ir::Graph* graph); void SaveMixedModel(); - void ConvertTensorDtype(int block_idx); + void ConvertTensorDtype(BlockID block_idx); void ProcessInputNode(bool support_precision, - ir::Node* in_node, - ir::Node* op_node, + framework::ir::Node* in_node, + framework::ir::Node* op_node, int* suffix, framework::BlockDesc* block_desc, - framework::proto::VarType::Type to_type, - int block_idx); + VarType::Type to_type, + BlockID block_idx); - void ProcessOutputNode(int block_idx, - ir::Node* var_node, - framework::proto::VarType::Type to_type); - inline bool IsFloatVarType(framework::proto::VarType::Type type); + void ProcessOutputNode(BlockID block_idx, + framework::ir::Node* var_node, + VarType::Type to_type); + inline bool IsFloatVarType(VarType::Type type); - bool OutShouldNotConvert(ir::Node* var_node); + bool OutShouldNotConvert(framework::ir::Node* var_node); // Just process special cases for weights conversion. - bool WeightsShouldNotConvert(ir::Node* var_node); + bool WeightsShouldNotConvert(framework::ir::Node* var_node); // To support multi block, we need to consider a lot of special cases. // Return Node* which first appers in block. 
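// (That is, for a variable shared across sub-blocks, the node from the
// block where the variable first appears carries the authoritative VarDesc,
// and lookups from other blocks are redirected to that node.)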
- framework::ir::Node* GetRealNode(int block_idx, framework::ir::Node* node); + framework::ir::Node* GetRealVarNode(BlockID block_idx, + framework::ir::Node* node); void FindVarsInMultiBlock(); - inline bool VarIsMultiPrecisionOpsOut(int block_idx, + inline bool VarIsMultiPrecisionOpsOut(BlockID block_idx, framework::ir::Node* op_node); private: @@ -167,11 +165,10 @@ class ConvertToMixedPrecisionPass { framework::Scope scope_; std::unordered_map cast_map_; - std::unordered_map> - vars_in_multi_block_map_; - std::vector>> - vars_appear_multi_in_one_block_; + std::unordered_map> + vars_in_multi_block_with_pair_; + std::unordered_map> + vars_in_multi_block_with_ops_; int suffix_{0}; std::unique_ptr program_desc_{nullptr}; @@ -179,91 +176,84 @@ class ConvertToMixedPrecisionPass { std::vector graphes_; }; -framework::ir::Node* ConvertToMixedPrecisionPass::GetRealNode( - int block_idx, framework::ir::Node* node) { - if (vars_in_multi_block_map_.count(node->Name())) { - int var_origin_block_id = vars_in_multi_block_map_.at(node->Name()).second; - if (block_idx != var_origin_block_id) { - auto graph = graphes_[var_origin_block_id]; - for (auto nd : graph->Nodes()) { - if (nd->Name() == node->Name()) { - return nd; +framework::ir::Node* ConvertToMixedPrecisionPass::GetRealVarNode( + BlockID block_idx, framework::ir::Node* var_node) { + CHECK_EQ(var_node->IsVar(), true); + + if (vars_in_multi_block_with_pair_.count(var_node->Name())) { + auto origin_blockId = + vars_in_multi_block_with_pair_.at(var_node->Name()).second; + if (block_idx != origin_blockId) { + auto* graph = graphes_[origin_blockId]; + for (auto* node : graph->Nodes()) { + if (node->Name() == var_node->Name()) { + return node; } } } } - return node; + return var_node; } -inline bool ConvertToMixedPrecisionPass::NodeVarHasDtype( - framework::ir::Node* node) { - if (node->IsVar() && - (node->Var()->GetType() == - paddle::framework::proto::VarType::SELECTED_ROWS || - node->Var()->GetType() == - paddle::framework::proto::VarType::LOD_TENSOR || - node->Var()->GetType() == - paddle::framework::proto::VarType::LOD_TENSOR_ARRAY || - node->Var()->GetType() == paddle::framework::proto::VarType::STRINGS || - node->Var()->GetType() == paddle::framework::proto::VarType::VOCAB)) { - return true; - } - - return false; +inline bool ConvertToMixedPrecisionPass::VarNodeHasDtype( + framework::ir::Node* var_node) { + CHECK_EQ(var_node->IsVar(), true); + auto type = var_node->Var()->GetType(); + return (type == VarType::SELECTED_ROWS) || (type == VarType::LOD_TENSOR) || + (type == VarType::LOD_TENSOR_ARRAY) || (type == VarType::STRINGS) || + (type == VarType::VOCAB); } // op1(fp32) -> var1, op2(fp16) -> var1 // if and only if op1 and op2 both support fp16, we convert op1 and op2's // precision. 
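// Illustrative instance of this rule (an assumption, not named in the patch): // if var1 is written both by a fp16-capable matmul and by an fp32-only custom // op, converting only one writer would leave var1 claimed at two dtypes, so // such a var is kept in fp32.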
inline bool ConvertToMixedPrecisionPass::VarIsMultiPrecisionOpsOut( - int block_idx, framework::ir::Node* op_node) { + BlockID block_idx, framework::ir::Node* op_node) { CHECK_EQ(op_node->IsOp(), true); - bool ret{false}; - - for (auto* out : op_node->outputs) { - auto* real_node = GetRealNode(block_idx, out); - if (!real_node->Var()->Persistable() && - vars_appear_multi_in_one_block_[block_idx].count(out->Name())) { - for (auto op_type : - vars_appear_multi_in_one_block_[block_idx].at(out->Name())) { - if (OpSupportPrecision( + + for (auto* var_node : op_node->outputs) { + if (!var_node->IsVar()) continue; + auto* real_var_node = GetRealVarNode(block_idx, var_node); + if (!real_var_node->Var()->Persistable() && + vars_in_multi_block_with_ops_.count(var_node->Name())) { + for (const auto& op_type : + vars_in_multi_block_with_ops_.at(var_node->Name())) { + if (!OpSupportPrecision( op_type, backend_, mixed_precision_, black_list_)) { - ret = true; - VLOG(2) << out->Name() + VLOG(2) << var_node->Name() << " is multi precision op's out, so we skip convert to fp16"; - break; + return true; } } } - if (ret) break; } - return ret; + return false; } void ConvertToMixedPrecisionPass::ProcessInputNode( bool support_precision, - ir::Node* in_node, - ir::Node* op_node, + framework::ir::Node* in_node, + framework::ir::Node* op_node, int* suffix, framework::BlockDesc* block_desc, - framework::proto::VarType::Type to_type, - int block_idx) { - auto* real_node = GetRealNode(block_idx, in_node); - if (!NodeVarHasDtype(real_node)) return; - auto graph = graphes_[block_idx]; + VarType::Type to_type, + BlockID block_idx) { + if (!in_node->IsVar()) return; + auto* real_node = GetRealVarNode(block_idx, in_node); + if (!VarNodeHasDtype(real_node)) return; + auto* graph = graphes_[block_idx]; bool is_main_block = block_idx == 0; auto* in_var = real_node->Var(); auto in_var_type = in_var->GetDataType(); auto prev_type = in_var_type; - bool is_in_multi_block = vars_in_multi_block_map_.count(in_var->Name()); + bool is_in_multi_block = vars_in_multi_block_with_pair_.count(in_var->Name()); if (!is_main_block && is_in_multi_block) { - in_var_type = vars_in_multi_block_map_.at(in_var->Name()).first; + in_var_type = vars_in_multi_block_with_pair_.at(in_var->Name()).first; } if (support_precision) { - if (in_var->Persistable() && - in_var_type == framework::proto::VarType::FP32) { + if (in_var->Persistable() && in_var_type == VarType::FP32) { if (WeightsShouldNotConvert(in_node)) return; in_var->SetDataType(to_type); in_var_type = to_type; @@ -300,14 +290,13 @@ void ConvertToMixedPrecisionPass::ProcessInputNode( } void ConvertToMixedPrecisionPass::ProcessOutputNode( - int block_idx, - ir::Node* var_node, - framework::proto::VarType::Type to_type) { - auto* real_node = GetRealNode(block_idx, var_node); - if (!NodeVarHasDtype(real_node)) return; + BlockID block_idx, framework::ir::Node* var_node, VarType::Type to_type) { + if (!var_node->IsVar()) return; + auto* real_node = GetRealVarNode(block_idx, var_node); + if (!VarNodeHasDtype(real_node)) return; auto* out_var = real_node->Var(); auto prev_type = out_var->GetDataType(); - if (out_var->GetDataType() == framework::proto::VarType::FP32) { + if (out_var->GetDataType() == VarType::FP32) { if (OutShouldNotConvert(var_node)) return; out_var->SetDataType(to_type); } @@ -316,7 +305,8 @@ void ConvertToMixedPrecisionPass::ProcessOutputNode( } // Just process special cases. 
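// e.g. an op output that must stay fp32 even when the op itself runs in a // low-precision kernel (an illustrative description; the concrete per-op // checks live in the function bodies, which this diff leaves unchanged).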
-bool ConvertToMixedPrecisionPass::OutShouldNotConvert(ir::Node* var_node) { +bool ConvertToMixedPrecisionPass::OutShouldNotConvert( + framework::ir::Node* var_node) { auto op_node = var_node->inputs[0]; auto* op_desc = op_node->Op(); @@ -343,7 +333,8 @@ bool ConvertToMixedPrecisionPass::OutShouldNotConvert(ir::Node* var_node) { return false; } -bool ConvertToMixedPrecisionPass::WeightsShouldNotConvert(ir::Node* var_node) { +bool ConvertToMixedPrecisionPass::WeightsShouldNotConvert( + framework::ir::Node* var_node) { auto op_nodes = var_node->outputs; for (auto* op_node : op_nodes) { auto* op_desc = op_node->Op(); @@ -391,13 +382,10 @@ bool ConvertToMixedPrecisionPass::WeightsShouldNotConvert(ir::Node* var_node) { return false; } -inline bool ConvertToMixedPrecisionPass::IsFloatVarType( - framework::proto::VarType::Type type) { - if (type == framework::proto::VarType::FP16 || - type == framework::proto::VarType::FP32 || - type == framework::proto::VarType::BF16) - return true; - return false; + +inline bool ConvertToMixedPrecisionPass::IsFloatVarType(VarType::Type type) { + return (type == VarType::FP16) || (type == VarType::FP32) || + (type == VarType::BF16); } void ConvertToMixedPrecisionPass::LoadAndPrepare() { @@ -405,6 +393,10 @@ void ConvertToMixedPrecisionPass::LoadAndPrepare() { inference::Load(&executor_, &scope_, model_file_, params_file_); main_graph_ = std::unique_ptr( new framework::ir::Graph(*program_desc_)); + for (size_t i = 0; i < main_graph_->SubGraphsSize(); ++i) { + auto* graph = main_graph_->GetSubGraph(i); + graphes_.push_back(graph); + } // Remove all control var IrInferCleanGraphPass pass; @@ -412,41 +404,45 @@ void ConvertToMixedPrecisionPass::LoadAndPrepare() { arg.SetMainGraphNotOwned(main_graph_.get()); pass.Run(&arg); - vars_appear_multi_in_one_block_.resize(program_desc_->Size()); FindVarsInMultiBlock(); } void ConvertToMixedPrecisionPass::FindVarsInMultiBlock() { - std::vector> block_var_names_set(program_desc_->Size()); - for (size_t i = 0; i < program_desc_->Size(); ++i) { - for (auto op : program_desc_->Block(i).AllOps()) { - auto in_names = op->InputArgumentNames(); - block_var_names_set[i].insert(in_names.begin(), in_names.end()); - auto out_names = op->OutputArgumentNames(); + std::unordered_set all_var_names_set; + std::vector> block_var_names_set( + program_desc_->Size()); + for (BlockID idx = 0; idx < program_desc_->Size(); ++idx) { + for (auto* op : program_desc_->Block(idx).AllOps()) { + const auto& in_names = op->InputArgumentNames(); + block_var_names_set[idx].insert(in_names.begin(), in_names.end()); + const auto& out_names = op->OutputArgumentNames(); + block_var_names_set[idx].insert(out_names.begin(), out_names.end()); + if (op->HasAttr("sub_block") == false) { - for (auto& n : out_names) { - if (block_var_names_set[i].count(n)) { - vars_appear_multi_in_one_block_[i][n].push_back(op->Type()); + for (const auto& name : out_names) { + if (all_var_names_set.count(name)) { + vars_in_multi_block_with_ops_[name].push_back(op->Type()); } } } - block_var_names_set[i].insert(out_names.begin(), out_names.end()); + all_var_names_set.insert(block_var_names_set[idx].begin(), + block_var_names_set[idx].end()); } } - for (size_t i = 0; i < program_desc_->Size() - 1; ++i) { - for (size_t j = i + 1; j < program_desc_->Size(); ++j) { - std::set vars_in_multi_block; - std::set_intersection( - block_var_names_set[i].begin(), - block_var_names_set[i].end(), - block_var_names_set[j].begin(), - block_var_names_set[j].end(), - std::inserter(vars_in_multi_block, 
vars_in_multi_block.begin())); - - for (auto name : vars_in_multi_block) { - vars_in_multi_block_map_.emplace( - name, std::make_pair(framework::proto::VarType::FP32, i)); + CHECK_GT(program_desc_->Size(), 0U); + for (BlockID idx = 0; idx < program_desc_->Size() - 1; ++idx) { + for (BlockID jdx = idx + 1; jdx < program_desc_->Size(); ++jdx) { + std::vector vars_in_multi_block; + std::set_intersection(block_var_names_set[idx].begin(), + block_var_names_set[idx].end(), + block_var_names_set[jdx].begin(), + block_var_names_set[jdx].end(), + std::back_inserter(vars_in_multi_block)); + + for (const auto& name : vars_in_multi_block) { + vars_in_multi_block_with_pair_.emplace( + name, std::make_pair(VarType::FP32, idx)); } } } @@ -462,41 +458,34 @@ void ConvertToMixedPrecisionPass::ConvertAllFp64ToFp32( if (op_type == "fill_constant") { if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP64)) - op_node->Op()->SetAttr( - "dtype", static_cast(framework::proto::VarType::FP32)); + static_cast(VarType::FP64)) + op_node->Op()->SetAttr("dtype", static_cast(VarType::FP32)); } else if (op_type == "assign_value") { if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP64)) - op_node->Op()->SetAttr( - "dtype", static_cast(framework::proto::VarType::FP32)); + static_cast(VarType::FP64)) + op_node->Op()->SetAttr("dtype", static_cast(VarType::FP32)); } else if (op_type == "eye") { if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP64)) - op_node->Op()->SetAttr( - "dtype", static_cast(framework::proto::VarType::FP32)); + static_cast(VarType::FP64)) + op_node->Op()->SetAttr("dtype", static_cast(VarType::FP32)); } else if (op_type == "fill_any_like") { if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP64)) - op_node->Op()->SetAttr( - "dtype", static_cast(framework::proto::VarType::FP32)); + static_cast(VarType::FP64)) + op_node->Op()->SetAttr("dtype", static_cast(VarType::FP32)); } else if (op_type == "cast") { if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("in_dtype")) == - static_cast(framework::proto::VarType::FP64)) - op_node->Op()->SetAttr( - "in_dtype", static_cast(framework::proto::VarType::FP32)); + static_cast(VarType::FP64)) + op_node->Op()->SetAttr("in_dtype", static_cast(VarType::FP32)); if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("out_dtype")) == - static_cast(framework::proto::VarType::FP64)) - op_node->Op()->SetAttr( - "out_dtype", static_cast(framework::proto::VarType::FP32)); + static_cast(VarType::FP64)) + op_node->Op()->SetAttr("out_dtype", static_cast(VarType::FP32)); } auto inputs = op_node->inputs; for (auto* in_node : inputs) { auto* in_var = in_node->Var(); - if (!in_var->Persistable() && - in_var->GetDataType() == framework::proto::VarType::FP64) { - in_var->SetDataType(framework::proto::VarType::FP32); + if (!in_var->Persistable() && in_var->GetDataType() == VarType::FP64) { + in_var->SetDataType(VarType::FP32); } } } @@ -505,9 +494,8 @@ void ConvertToMixedPrecisionPass::ConvertAllFp64ToFp32( void ConvertToMixedPrecisionPass::Run() { LoadAndPrepare(); - for (size_t i = 0; i < main_graph_->SubGraphsSize(); ++i) { - auto graph = main_graph_->GetSubGraph(i); - graphes_.push_back(graph); + for (size_t i = 0; i < graphes_.size(); ++i) { + auto* graph = graphes_[i]; VLOG(2) << " -------- handle subgraph " << i << ", has " << graph->Nodes().size() << " nodes --------"; @@ 
-518,19 +506,19 @@ void ConvertToMixedPrecisionPass::Run() { // A trick PatchForStrangeOp(); - CHECK_EQ(ir::VarDescIsConsistency(*graph), true); + CHECK_EQ(framework::ir::VarDescIsConsistency(*graph), true); } SaveMixedModel(); } -void ConvertToMixedPrecisionPass::ConvertTensorDtype(int block_idx) { - auto graph = graphes_[block_idx]; - framework::proto::VarType::Type to_type; +void ConvertToMixedPrecisionPass::ConvertTensorDtype(BlockID block_idx) { + auto* graph = graphes_[block_idx]; + VarType::Type to_type; if (mixed_precision_ == phi::DataType::FLOAT16) { - to_type = framework::proto::VarType::FP16; + to_type = VarType::FP16; } else if (mixed_precision_ == phi::DataType::BFLOAT16) { - to_type = framework::proto::VarType::BF16; + to_type = VarType::BF16; } else { PADDLE_THROW(paddle::platform::errors::InvalidArgument( "mixed_precision currently not supported dtype %d, we now only " @@ -551,8 +539,7 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(int block_idx) { // 1. set input dtype. if (op_type == "feed") { auto feed_var = op_node->outputs[0]->Var(); - if (!keep_io_types_ && - feed_var->GetDataType() == framework::proto::VarType::FP32) { + if (!keep_io_types_ && feed_var->GetDataType() == VarType::FP32) { feed_var->SetDataType(to_type); } } else if (op_type == "fetch") { @@ -568,15 +555,17 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(int block_idx) { // same name. std::unordered_map in_name_to_node; for (auto* in : op_node->inputs) { - auto* real_node = GetRealNode(block_idx, in); - if (NodeVarHasDtype(real_node)) { + if (!in->IsVar()) continue; + auto* real_node = GetRealVarNode(block_idx, in); + if (VarNodeHasDtype(real_node)) { in_name_to_node[in->Name()] = in; } } - for (auto out : op_node->outputs) { - auto* real_node = GetRealNode(block_idx, out); - if (NodeVarHasDtype(real_node)) { + for (auto* out : op_node->outputs) { + if (!out->IsVar()) continue; + auto* real_node = GetRealVarNode(block_idx, out); + if (VarNodeHasDtype(real_node)) { if (in_name_to_node.count(out->Name())) real_node->Var()->SetDataType( in_name_to_node[out->Name()]->Var()->GetDataType()); @@ -591,32 +580,46 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(int block_idx) { // - add cast op if the input dtype is not fp16/bf16. // - set output dtype. // - // If a var(op's out var) appears multiple times in a block, we should not + // If a var(op's out var) appears multiple times in graph, we should not // convert to fp16. else if (black_list_.count(op_type) == 0 && // NOLINT !VarIsMultiPrecisionOpsOut(block_idx, op_node)) { bool support_precision = OpSupportPrecision(op_type, backend_, mixed_precision_, black_list_); - // if op not has float input, we will not choose the low precision kernel. + // If the op has neither a float input nor a float output, we will not + // choose the low precision kernel.
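+    // e.g. an op whose inputs and outputs are all integer tensors (indices or +    // masks, say) cannot benefit from a fp16/bf16 kernel; an illustrative +    // example, not one named by this patch.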
{ - bool has_float_input{false}; - for (auto in_node : op_node->inputs) { - auto* real_node = GetRealNode(block_idx, in_node); - if (real_node->Var()->GetDataType() == proto::VarType::FP16 || - real_node->Var()->GetDataType() == proto::VarType::FP32 || - real_node->Var()->GetDataType() == proto::VarType::FP64 || - real_node->Var()->GetDataType() == proto::VarType::BF16) { - has_float_input = true; + bool has_float_input_and_output{false}; + for (auto* in_node : op_node->inputs) { + if (!in_node->IsVar()) continue; + auto* real_node = GetRealVarNode(block_idx, in_node); + if (real_node->Var()->GetDataType() == VarType::FP16 || + real_node->Var()->GetDataType() == VarType::FP32 || + real_node->Var()->GetDataType() == VarType::FP64 || + real_node->Var()->GetDataType() == VarType::BF16) { + has_float_input_and_output = true; break; } } - if (!has_float_input) { + for (auto* out_node : op_node->outputs) { + if (!out_node->IsVar()) continue; + auto* real_node = GetRealVarNode(block_idx, out_node); + if (real_node->Var()->GetDataType() == VarType::FP16 || + real_node->Var()->GetDataType() == VarType::FP32 || + real_node->Var()->GetDataType() == VarType::FP64 || + real_node->Var()->GetDataType() == VarType::BF16) { + has_float_input_and_output = true; + break; + } + } + if (!has_float_input_and_output) { support_precision = false; - VLOG(2) << " op doesn't has float input, just skip."; + VLOG(2) << " op doesn't have float input or output, just skip."; } } - VLOG(2) << " support low precision " << support_precision; + VLOG(2) << "op type: " << op_type + << " support low precision: " << support_precision; if (support_precision) { VLOG(2) << " process input nodes:"; @@ -626,8 +629,8 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(int block_idx) { // Just for paddle's terriable case: op's input and output has the same // name. std::unordered_map names_map; - for (auto out_node : op_node->outputs) { - for (auto in_node : op_node->inputs) { + for (auto* out_node : op_node->outputs) { + for (auto* in_node : op_node->inputs) { if (out_node->Name() == in_node->Name()) { names_map[out_node->Name()] = in_node->Name(); } @@ -655,7 +658,7 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(int block_idx) { op_node, &suffix_, block_desc, - framework::proto::VarType::FP32, + VarType::FP32, block_idx); } } @@ -665,21 +668,19 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(int block_idx) { // - add cast op if the input dtype is not fp32. else { // NOLINT VLOG(3) << "not to run fp16 op_type: " << op_type; - auto ins = op_node->inputs; - for (auto* in_node : ins) { + for (auto* in_node : op_node->inputs) { auto* in_var = in_node->Var(); if (in_var->GetDataType() == to_type) { AddCastOp(graph, in_node, op_node, to_type, - framework::proto::VarType::FP32, + VarType::FP32, &suffix_, block_desc, &cast_map_); VLOG(3) << "-- " << in_node->Name() << "(" << to_type << ") to " - << cast_map_[in_node]->Name() << "(" - << framework::proto::VarType::FP32 << ")"; + << cast_map_[in_node]->Name() << "(" << VarType::FP32 << ")"; } } } @@ -688,31 +689,30 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(int block_idx) { // 4. if output_op's dtype is not compatible to output dtype, then just // insert cast.
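// Net effect of the loop below: with keep_io_types_ == true a fp16/bf16 // fetch target is cast back to fp32 so callers still observe fp32 outputs; // with keep_io_types_ == false a fp32 fetch target is cast down to the mixed // precision instead.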
for (auto* node : output_nodes) { - ir::Node* fetch_op{nullptr}; + framework::ir::Node* fetch_op{nullptr}; for (auto* op_node : node->outputs) { if (op_node->IsOp() && op_node->Op()->Type() == "fetch") { fetch_op = op_node; } } CHECK_NOTNULL(fetch_op); - auto var = node->Var(); + auto* var = node->Var(); if (keep_io_types_ && var->GetDataType() == to_type) { // fp16/bf16 -> fp32. AddCastOp(graph, node, fetch_op, to_type, - framework::proto::VarType::FP32, + VarType::FP32, &suffix_, block_desc, &cast_map_); - } else if (!keep_io_types_ && - var->GetDataType() == framework::proto::VarType::FP32) { + } else if (!keep_io_types_ && var->GetDataType() == VarType::FP32) { // fp32 -> fp16/bf16 AddCastOp(graph, node, fetch_op, - framework::proto::VarType::FP32, + VarType::FP32, to_type, &suffix_, block_desc, @@ -720,13 +720,15 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(int block_idx) { } } - for (auto node : graph->Nodes()) { - auto* real_node = GetRealNode(block_idx, node); - if (!NodeVarHasDtype(real_node)) continue; + for (auto* node : graph->Nodes()) { + if (!node->IsVar()) continue; + auto* real_node = GetRealVarNode(block_idx, node); + if (!VarNodeHasDtype(real_node)) continue; - if (vars_in_multi_block_map_.count(real_node->Name()) && - vars_in_multi_block_map_.at(real_node->Name()).second == block_idx) { - vars_in_multi_block_map_.at(real_node->Name()).first = + if (vars_in_multi_block_with_pair_.count(real_node->Name()) && + vars_in_multi_block_with_pair_.at(real_node->Name()).second == + block_idx) { + vars_in_multi_block_with_pair_.at(real_node->Name()).first = real_node->Var()->GetDataType(); } } @@ -757,17 +759,15 @@ void ConvertToMixedPrecisionPass::SaveMixedModel() { framework::ProgramDesc mixed_program_desc; framework::ir::GraphToProgram(*main_graph_, &mixed_program_desc); - paddle::CPUPlace place; auto parameters = scope_.LocalVarNames(); std::sort(parameters.begin(), parameters.end()); std::unordered_set weights_should_be_fp32; for (auto* node : main_graph_->Nodes()) { - if (!(node->IsVar())) continue; - if (NodeVarHasDtype(node)) { + if (!node->IsVar()) continue; + if (VarNodeHasDtype(node)) { if (node->Var()->Persistable() && - node->Var()->GetDataType() == - paddle::framework::proto::VarType::FP32) { + node->Var()->GetDataType() == VarType::FP32) { VLOG(2) << "weights keep to fp32: " << node->Name(); weights_should_be_fp32.insert(node->Name()); } @@ -777,26 +777,27 @@ void ConvertToMixedPrecisionPass::SaveMixedModel() { #define CONVERT_TENSOR_DTYPE(DTYPE, dtype) \ mixed_tensor.set_type(DTYPE); \ auto* mixed_data = mixed_tensor.mutable_data(platform::CPUPlace()); \ - for (int i = 0; i < t->numel(); i++) { \ - mixed_data[i] = static_cast(data[i]); \ + for (int64_t i = 0; i < origin_tensor->numel(); i++) { \ + mixed_data[i] = static_cast(origin_data[i]); \ } \ - t->clear(); \ - paddle::framework::TensorCopySync(mixed_tensor, place, t) + origin_tensor->clear(); \ + paddle::framework::TensorCopySync( \ + mixed_tensor, platform::CPUPlace(), origin_tensor) for (const auto& param_name : parameters) { + if (weights_should_be_fp32.count(param_name)) continue; auto* var = scope_.FindLocalVar(param_name); if (var->IsType()) { - auto* t = var->GetMutable(); - if (t->dtype() != phi::DataType::FLOAT32) continue; + auto* origin_tensor = var->GetMutable(); + if (origin_tensor->dtype() != phi::DataType::FLOAT32) continue; phi::DenseTensor mixed_tensor; - mixed_tensor.Resize(t->dims()); - auto* data = t->mutable_data(platform::CPUPlace()); - if (mixed_precision_ == 
phi::DataType::FLOAT16 && - !weights_should_be_fp32.count(param_name)) { + mixed_tensor.Resize(origin_tensor->dims()); + auto* origin_data = + origin_tensor->mutable_data(platform::CPUPlace()); + if (mixed_precision_ == phi::DataType::FLOAT16) { CONVERT_TENSOR_DTYPE(paddle::experimental::DataType::FLOAT16, phi::dtype::float16); - } else if (mixed_precision_ == phi::DataType::BFLOAT16 && - !weights_should_be_fp32.count(param_name)) { + } else if (mixed_precision_ == phi::DataType::BFLOAT16) { CONVERT_TENSOR_DTYPE(paddle::experimental::DataType::BFLOAT16, phi::dtype::bfloat16); } @@ -851,8 +852,8 @@ void AddCastOp( framework::ir::Graph* graph, framework::ir::Node* node, framework::ir::Node* next_op, - framework::proto::VarType::Type from_type, - framework::proto::VarType::Type to_type, + VarType::Type from_type, + VarType::Type to_type, int* suffix, framework::BlockDesc* block_desc, std::unordered_map* map) { @@ -913,14 +914,15 @@ bool OpSupportPrecision(const std::string& op_type, return support_precision; } -void ConvertToMixedPrecision(const std::string& model_file, - const std::string& params_file, - const std::string& mixed_model_file, - const std::string& mixed_params_file, - phi::DataType mixed_precision, - phi::Backend backend, - bool keep_io_types, - std::unordered_set black_list) { +void ConvertToMixedPrecision( + const std::string& model_file, + const std::string& params_file, + const std::string& mixed_model_file, + const std::string& mixed_params_file, + phi::DataType mixed_precision, + phi::Backend backend, + bool keep_io_types, + const std::unordered_set& black_list) { ConvertToMixedPrecisionPass pass(model_file, params_file, mixed_model_file, diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h index 3b763a4420ed0..583512408c586 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h @@ -51,8 +51,8 @@ void ConvertToMixedPrecision(const std::string& model_file, const std::string& mixed_params_file, phi::DataType mixed_precision, phi::Backend backend, - bool keep_io_types = true, - std::unordered_set black_list = {}); + bool keep_io_types, + const std::unordered_set& black_list); } // namespace analysis } // namespace inference From b160d09eeb9453d05575beb88bc82703791ec977 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 27 Oct 2022 21:03:28 +0800 Subject: [PATCH 13/91] [JIT] Add Predictor for JITLayer (#47379) * add predictor_engine * add predictor_engine * fix zero shape * fix lodTensor * fix unittest * fix code style * update CmakeList --- paddle/fluid/inference/analysis/argument.h | 3 + .../analysis/passes/ir_graph_build_pass.cc | 8 +- .../analysis/passes/ir_graph_build_pass.h | 3 +- paddle/fluid/inference/api/analysis_config.cc | 4 + .../fluid/inference/api/analysis_predictor.cc | 37 ++-- .../inference/api/paddle_analysis_config.h | 11 ++ paddle/fluid/inference/io.cc | 25 +-- paddle/fluid/inference/io.h | 3 +- paddle/fluid/jit/CMakeLists.txt | 3 +- paddle/fluid/jit/engine/CMakeLists.txt | 5 + paddle/fluid/jit/engine/predictor_engine.cc | 186 ++++++++++++++++++ paddle/fluid/jit/engine/predictor_engine.h | 50 +++++ paddle/fluid/jit/function_schema.cc | 8 + paddle/fluid/jit/function_schema.h | 5 + paddle/fluid/jit/serializer.cc | 13 +- .../fluid/operators/collective/CMakeLists.txt | 8 +- .../operators/sequence_ops/CMakeLists.txt | 4 + paddle/fluid/platform/flags.cc | 5 +- 18 
files changed, 340 insertions(+), 41 deletions(-) create mode 100644 paddle/fluid/jit/engine/predictor_engine.cc create mode 100644 paddle/fluid/jit/engine/predictor_engine.h diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 52d332b2e34eb..d855dc999cab8 100755 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -150,6 +150,9 @@ struct Argument { DECL_ARGUMENT_FIELD(optim_cache_dir, OptimCacheDir, std::string); DECL_ARGUMENT_FIELD(enable_analysis_optim, EnableAnalysisOptim, bool); + // For JITLayer + DECL_ARGUMENT_FIELD(skip_load_params, SkipLoadParams, bool); + // The overall graph to work on. DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); // The overall Scope to work on. diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index cd93238ff2b56..e07eaa64615c8 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -55,7 +55,8 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { argument->model_params_path(), argument->scope_ptr(), place, - argument->model_from_memory_valid() && argument->model_from_memory()); + argument->model_from_memory_valid() && argument->model_from_memory(), + argument->skip_load_params()); argument->SetMainProgram(program.release()); } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -114,10 +115,11 @@ std::unique_ptr IrGraphBuildPass::LoadModel( const std::string &params_path, framework::Scope *scope, const platform::Place &place, - bool model_from_memory) { + bool model_from_memory, + bool skip_load_params) { framework::Executor exe(place); if (!model_from_memory) { - return Load(&exe, scope, program_path, params_path); + return Load(&exe, scope, program_path, params_path, !skip_load_params); } else { return LoadFromMemory(&exe, scope, program_path, params_path); } diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h index 32902ef066730..69047b73ea02a 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -43,7 +43,8 @@ class IrGraphBuildPass : public AnalysisPass { const std::string &params_path, framework::Scope *scope, const platform::Place &place, - bool model_from_memory); + bool model_from_memory, + bool skip_load_params); std::string model_binary_str_; }; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 2af92c7e1480d..be09976bc4d0e 100755 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -484,6 +484,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(custom_device_type_); CP_MEMBER(custom_device_id_); + // JITLayer related + CP_MEMBER(apply_optim_); + CP_MEMBER(skip_load_params_); + if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index f49c9faeb3d19..a78a768a7009d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -168,20 +168,27 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, LOG(ERROR) << "unsupported feed type " << pt.dtype; return false; } - - 
PADDLE_ENFORCE_NOT_NULL( - input_ptr, - paddle::platform::errors::Fatal( - "Cannot convert to LoDTensor because LoDTensor creation failed.")); - PADDLE_ENFORCE_NOT_NULL( - pt.data.data(), - paddle::platform::errors::InvalidArgument( - "The data contained in the input PaddleTensor is illegal.")); + // NOTE(Aurelius84): Some kernels support zero shape input + // without memory holder, we should skip enforce logic. + bool has_zero_dim = (phi::product(ddim) == 0); + if (has_zero_dim) { + VLOG(3) << "Found zero dim from input with ddim: " << ddim; + PADDLE_ENFORCE_NOT_NULL( + input_ptr, + paddle::platform::errors::Fatal( + "Cannot convert to LoDTensor because LoDTensor creation failed.")); + PADDLE_ENFORCE_NOT_NULL( + pt.data.data(), + paddle::platform::errors::InvalidArgument( + "The data contained in the input PaddleTensor is illegal.")); + } if (platform::is_cpu_place(place)) { // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. - std::memcpy( - static_cast(input_ptr), pt.data.data(), pt.data.length()); + if (input_ptr != nullptr) { + std::memcpy( + static_cast(input_ptr), pt.data.data(), pt.data.length()); + } } else if (platform::is_ipu_place(place)) { #ifdef PADDLE_WITH_IPU std::memcpy( @@ -529,6 +536,11 @@ bool AnalysisPredictor::PrepareProgram( // If the program is passed from external, no need to optimize it, this // logic is used in the clone scenario. inference_program_ = program; + if (config_.apply_optim_) { + VLOG(3) + << "apply_optim is enabled, will call OptimizeInferenceProgram()."; + OptimizeInferenceProgram(); + } } executor_->CreateVariables(*inference_program_, 0, false, sub_scope_); @@ -1065,11 +1077,12 @@ void AnalysisPredictor::PrepareArgument() { false, platform::errors::PreconditionNotMet( "Either model_dir or prog_file should be set.")); - std::string dir = inference::analysis::GetDirRoot(config_.prog_file()); argument_.SetModelProgramPath(config_.prog_file()); argument_.SetModelParamsPath(config_.params_file()); } + // For JITLayer + argument_.SetSkipLoadParams(config_.skip_load_params_); argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); argument_.SetTensorRtUseOSS(config_.trt_use_varseqlen_); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index c5a4fd5934caf..5bc50515bf40a 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -965,6 +965,10 @@ struct PD_INFER_DECL AnalysisConfig { void Exp_SetBlackListOpsForMixedModel( const std::unordered_set& black_list); + void SetApplyOptim(bool value) { apply_optim_ = value; } + + void SetSkipLoadParams(bool value) { skip_load_params_ = value; } + protected: // Update the config. void Update(); @@ -1167,6 +1171,13 @@ struct PD_INFER_DECL AnalysisConfig { // fleet exe related DistConfig dist_config_{}; + + // jit engine related + // NOTE(Aurelius84): In the case of Predictor in JITLayer, the program comes + // from outside, which means Predictor should apply optimization by calling + // PrepareProgram(). So we add this flag to control the process.
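+  // Typical JIT-side setup, as done by PredictorEngine's constructor later in +  // this patch: config.SetSkipLoadParams(true); config.SetApplyOptim(true);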
+ bool apply_optim_{false}; + bool skip_load_params_{false}; }; } // namespace paddle diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index cad5903540b90..253df63763329 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -160,11 +160,11 @@ std::unique_ptr Load(framework::Executor* executor, return main_program; } -std::unique_ptr Load( - framework::Executor* executor, - framework::Scope* scope, - const std::string& prog_filename, - const std::string& param_filename) { +std::unique_ptr Load(framework::Executor* executor, + framework::Scope* scope, + const std::string& prog_filename, + const std::string& param_filename, + bool load_params) { std::string program_desc_str; ReadBinaryFile(prog_filename, &program_desc_str); @@ -175,13 +175,14 @@ std::unique_ptr Load( true, platform::errors::Unavailable("Model version %ld is not supported.", main_program->Version())); - - LoadPersistables(executor, - scope, - *main_program, - "", - param_filename, - false /* model_from_memory */); + if (load_params) { + LoadPersistables(executor, + scope, + *main_program, + "", + param_filename, + false /* model_from_memory */); + } return main_program; } diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index 31ed29e425dd9..36e21f8f36e13 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -42,7 +42,8 @@ std::unique_ptr Load(framework::Executor* executor, std::unique_ptr Load(framework::Executor* executor, framework::Scope* scope, const std::string& prog_filename, - const std::string& param_filename); + const std::string& param_filename, + bool load_params = true); std::unique_ptr LoadFromMemory( framework::Executor* executor, diff --git a/paddle/fluid/jit/CMakeLists.txt b/paddle/fluid/jit/CMakeLists.txt index f47de23b0e165..b6db37d82c3af 100644 --- a/paddle/fluid/jit/CMakeLists.txt +++ b/paddle/fluid/jit/CMakeLists.txt @@ -35,7 +35,7 @@ cc_library( jit_function SRCS function.cc DEPS jit_function_utils jit_executor_engine jit_pe_engine - jit_interpreter_engine) + jit_interpreter_engine jit_predictor_engine) cc_library( jit_layer @@ -48,6 +48,7 @@ cc_library( jit_executor_engine jit_pe_engine jit_interpreter_engine + jit_predictor_engine jit_function) if(WITH_TESTING AND NOT WIN32) diff --git a/paddle/fluid/jit/engine/CMakeLists.txt b/paddle/fluid/jit/engine/CMakeLists.txt index 5626e9eb1fc67..b09e818227d76 100644 --- a/paddle/fluid/jit/engine/CMakeLists.txt +++ b/paddle/fluid/jit/engine/CMakeLists.txt @@ -12,3 +12,8 @@ cc_library( jit_interpreter_engine SRCS interpreter_engine.cc DEPS standalone_executor) + +cc_library( + jit_predictor_engine + SRCS predictor_engine.cc + DEPS paddle_inference_api analysis_predictor) diff --git a/paddle/fluid/jit/engine/predictor_engine.cc b/paddle/fluid/jit/engine/predictor_engine.cc new file mode 100644 index 0000000000000..d6bdf42b041a4 --- /dev/null +++ b/paddle/fluid/jit/engine/predictor_engine.cc @@ -0,0 +1,186 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/jit/engine/predictor_engine.h" + +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/paddle_api.h" +#include "paddle/fluid/jit/function_utils.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace jit { + +static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t); +static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, + DenseTensor *t, + const platform::Place &place); + +PredictorEngine::PredictorEngine(const std::shared_ptr &info, + const VariableMap &params_dict, + const phi::Place &place) + : info_(info), scope_(new framework::Scope()), place_(place) { + utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, scope_.get()); + VLOG(6) << framework::GenScopeTreeDebugInfo(scope_.get()); + + AnalysisConfig config; + config.SetProgFile(info->ProgramFilePath()); + if (platform::is_gpu_place(place_)) { + config.EnableUseGpu(100, place_.GetDeviceId()); + } else if (platform::is_cpu_place(place_)) { + config.DisableGpu(); + } + config.SetSkipLoadParams(true); + config.SetApplyOptim(true); + config.SwitchIrOptim(true); + + predictor_.reset(new AnalysisPredictor(config)); + + predictor_->Init( + scope_, std::make_shared(info_->ProgramDesc())); +} + +std::vector PredictorEngine::operator()( + const std::vector &inputs) { + auto dense_tensors = utils::ToDenseTensors(inputs); + return utils::ToTensors(this->operator()(dense_tensors)); +} + +std::vector PredictorEngine::operator()( + const std::vector &inputs) { + for (auto t : inputs) { + VLOG(1) << "inputs is init: " << t.initialized(); + } + + std::vector pt_inputs; + std::vector pt_outputs; + for (auto &t : inputs) { + auto non_const_t = const_cast(&t); + pt_inputs.emplace_back(DenseTensorToPaddleTensor(non_const_t)); + } + + predictor_->Run(pt_inputs, &pt_outputs); + + std::vector outputs; + for (auto &pt : pt_outputs) { + DenseTensor t; + PaddleTensorToDenseTensor(pt, &t, place_); + outputs.emplace_back(t); + } + + return outputs; +} + +static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t) { + PaddleTensor pt; + + if (framework::TransToProtoVarType(t->dtype()) == + framework::proto::VarType::INT32) { + pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); + pt.dtype = PaddleDType::INT32; + } else if (framework::TransToProtoVarType(t->dtype()) == + framework::proto::VarType::INT64) { + pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); + pt.dtype = PaddleDType::INT64; + } else if (framework::TransToProtoVarType(t->dtype()) == + framework::proto::VarType::FP32) { + pt.data.Reset(t->data(), t->numel() * sizeof(float)); + pt.dtype = PaddleDType::FLOAT32; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported tensor data type. 
Now only supports INT64, FP32, INT32.")); + } + pt.shape = phi::vectorize(t->dims()); + return pt; +} + +static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, + DenseTensor *t, + const platform::Place &place) { + framework::DDim ddim = phi::make_ddim(pt.shape); + void *input_ptr; + if (pt.dtype == PaddleDType::INT64) { + input_ptr = t->mutable_data(ddim, place); + } else if (pt.dtype == PaddleDType::FLOAT32) { + input_ptr = t->mutable_data(ddim, place); + } else if (pt.dtype == PaddleDType::INT32) { + input_ptr = t->mutable_data(ddim, place); + } else if (pt.dtype == PaddleDType::FLOAT16) { + input_ptr = t->mutable_data(ddim, place); + } else { + LOG(ERROR) << "unsupported feed type " << pt.dtype; + return false; + } + + PADDLE_ENFORCE_NOT_NULL( + input_ptr, + paddle::platform::errors::Fatal( + "Cannot convert to LoDTensor because LoDTensor creation failed.")); + PADDLE_ENFORCE_NOT_NULL( + pt.data.data(), + paddle::platform::errors::InvalidArgument( + "The data contained in the input PaddleTensor is illegal.")); + + if (platform::is_cpu_place(place)) { + // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. + std::memcpy( + static_cast(input_ptr), pt.data.data(), pt.data.length()); + } else if (platform::is_ipu_place(place)) { +#ifdef PADDLE_WITH_IPU + std::memcpy( + static_cast(input_ptr), pt.data.data(), pt.data.length()); +#else + PADDLE_THROW(paddle::platform::errors::Fatal( + "Not compile with WITH_IPU, should not reach here.")); +#endif + } else if (platform::is_gpu_place(place)) { + PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), + false, + platform::errors::InvalidArgument( + "Only one choice can be made between CPU and XPU.")); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = static_cast(pool.Get(place)); + auto dst_gpu_place = place; + memory::Copy(dst_gpu_place, + static_cast(input_ptr), + platform::CPUPlace(), + pt.data.data(), + pt.data.length(), + dev_ctx->stream()); +#else + PADDLE_THROW(paddle::platform::errors::Fatal( + "Not compile with CUDA, should not reach here.")); +#endif + } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU + auto dst_xpu_place = place; + memory::Copy(dst_xpu_place, + static_cast(input_ptr), + platform::CPUPlace(), + pt.data.data(), + pt.data.length()); +#else + PADDLE_THROW(paddle::platform::errors::Fatal( + "Not compile with XPU, should not reach here.")); +#endif + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The analysis predictor supports CPU, GPU and XPU now.")); + } + return true; +} + +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/jit/engine/predictor_engine.h b/paddle/fluid/jit/engine/predictor_engine.h new file mode 100644 index 0000000000000..026b012cbfb02 --- /dev/null +++ b/paddle/fluid/jit/engine/predictor_engine.h @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/jit/engine/base_engine.h" +#include "paddle/fluid/jit/function_schema.h" +#include "paddle/fluid/jit/function_utils.h" + +namespace paddle { +class AnalysisPredictor; + +namespace framework { +class Scope; +} + +namespace jit { + +class PredictorEngine : public BaseEngine { + public: + PredictorEngine(const std::shared_ptr &info, + const VariableMap ¶ms_dict, + const phi::Place &place); + + ~PredictorEngine() noexcept {} + + std::vector operator()(const std::vector &inputs); + + std::vector operator()(const std::vector &inputs); + + private: + std::shared_ptr info_; + std::shared_ptr scope_; + phi::Place place_; + std::shared_ptr predictor_; +}; + +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/jit/function_schema.cc b/paddle/fluid/jit/function_schema.cc index 8150d3b2e7589..0d2014153e1d7 100644 --- a/paddle/fluid/jit/function_schema.cc +++ b/paddle/fluid/jit/function_schema.cc @@ -82,6 +82,14 @@ const std::vector FunctionInfo::OutputArgNames() const { return schema_.OutputArgNames(); } +const std::string& FunctionInfo::ProgramFilePath() const { + return prog_file_path_; +} + +void FunctionInfo::SetProgramFilePath(const std::string& path) { + prog_file_path_ = path; +} + void FunctionInfo::RemoveDescFeedFetch() { utils::RemoveFeedFetch(program_desc_.get()); } diff --git a/paddle/fluid/jit/function_schema.h b/paddle/fluid/jit/function_schema.h index 9f593dd7eee24..1a760805584ed 100644 --- a/paddle/fluid/jit/function_schema.h +++ b/paddle/fluid/jit/function_schema.h @@ -72,6 +72,10 @@ class FunctionInfo { const std::vector OutputArgNames() const; + const std::string& ProgramFilePath() const; + + void SetProgramFilePath(const std::string& path); + void RemoveDescFeedFetch(); private: @@ -79,6 +83,7 @@ class FunctionInfo { std::vector param_names_; std::shared_ptr program_desc_; FunctionSchema schema_; + std::string prog_file_path_; }; } // namespace jit diff --git a/paddle/fluid/jit/serializer.cc b/paddle/fluid/jit/serializer.cc index 8e8bb370e81c4..9c819c52718c0 100644 --- a/paddle/fluid/jit/serializer.cc +++ b/paddle/fluid/jit/serializer.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/jit/engine/executor_engine.h" #include "paddle/fluid/jit/engine/interpreter_engine.h" #include "paddle/fluid/jit/engine/pe_engine.h" +#include "paddle/fluid/jit/engine/predictor_engine.h" #include "paddle/fluid/jit/layer.h" #include "paddle/fluid/jit/property.h" #include "paddle/fluid/jit/serializer_utils.h" @@ -54,6 +55,7 @@ Layer Deserializer::operator()(const std::string& path, param_names_set.insert(persist_var_names.begin(), persist_var_names.end()); info_map[func_name] = std::make_shared( func_name, persist_var_names, program_desc); + info_map[func_name]->SetProgramFilePath(it.second); } VariableMap params_dict; @@ -70,22 +72,23 @@ Layer Deserializer::operator()(const std::string& path, for (auto it = info_map.begin(); it != info_map.end(); ++it) { const std::string& func_name = it->first; auto& info = it->second; + VLOG(3) << "Add function type: " << FLAGS_jit_engine_type + << " Function name: " << func_name; if (FLAGS_jit_engine_type == "Executor") { - VLOG(3) << "Add function type: ExecutorEngine. Function name: " - << func_name; layer.SetEngine( func_name, utils::MakeEngine(info, params_dict, place)); } else if (FLAGS_jit_engine_type == "PE") { - VLOG(3) << "Add function type: PEEngine. 
Function name: " << func_name; layer.SetEngine(func_name, utils::MakeEngine(info, params_dict, place)); } else if (FLAGS_jit_engine_type == "New") { - VLOG(3) << "Add function type: InterpreterEngine. Function name: " - << func_name; layer.SetEngine( func_name, utils::MakeEngine(info, params_dict, place)); + } else if (FLAGS_jit_engine_type == "Predictor") { + layer.SetEngine( + info->FunctionName(), + utils::MakeEngine(info, params_dict, place)); } else { PD_THROW("Invalid JitLayer engine type."); } diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index c94b0c93eb34a..e29b3f6639f1e 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -17,6 +17,10 @@ foreach(src ${OPS}) ${COLLECTIVE_COMPILE_FLAGS}) endforeach() +if(WITH_GLOO) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) +endif() + register_operators( EXCLUDES c_gen_bkcl_id_op @@ -35,10 +39,6 @@ if(WITH_NCCL OR WITH_RCCL) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() -if(WITH_GLOO) - set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) -endif() - if(WITH_XPU_BKCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper) op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) diff --git a/paddle/fluid/operators/sequence_ops/CMakeLists.txt b/paddle/fluid/operators/sequence_ops/CMakeLists.txt index fe36afd96c5e8..06281b6f376fd 100644 --- a/paddle/fluid/operators/sequence_ops/CMakeLists.txt +++ b/paddle/fluid/operators/sequence_ops/CMakeLists.txt @@ -4,3 +4,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() register_operators() + +if(WITH_UNITY_BUILD) + target_link_libraries(paddle_operators_sequence_ops_unity sequence_pooling) +endif() diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 813171240da06..bac075c1d9053 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -1010,13 +1010,14 @@ PADDLE_DEFINE_EXPORTED_bool( * Name: FLAGS_jit_engine_type * Since Version: 2.3.0 * Value Range: string, {Executor, PE}, - * default=PE + * default=Predictor * Example: * Note: * FLAGS_jit_engine_type == Executor, using ExecutorEngine by default * FLAGS_jit_engine_type == PE, using PEEngine by default * FLAGS_jit_engine_type == New, using InterpreterEngine by default + * FLAGS_jit_engine_type == Predictor, using inference Predictor by default */ PADDLE_DEFINE_EXPORTED_string(jit_engine_type, - "PE", + "Predictor", "Choose default funciton type in JitLayer."); From 800e05346a69f94bb594609fd1c5532715f37e15 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Fri, 28 Oct 2022 10:16:09 +0800 Subject: [PATCH 14/91] fix pragma-pack warning on macos (#47399) --- cmake/flags.cmake | 3 ++- paddle/fluid/operators/math/bloomfilter.h | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 4c26366c5acf1..0267f251e490f 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -221,7 +221,8 @@ if(APPLE) -Werror=braced-scalar-init -Werror=uninitialized -Werror=tautological-constant-out-of-range-compare - -Werror=literal-conversion) + -Werror=literal-conversion + -Werror=pragma-pack) endif() if(WITH_HETERPS AND WITH_PSLIB) diff --git a/paddle/fluid/operators/math/bloomfilter.h b/paddle/fluid/operators/math/bloomfilter.h index ba907b5012b3c..ce019a30ed9f5 100644 --- a/paddle/fluid/operators/math/bloomfilter.h +++ b/paddle/fluid/operators/math/bloomfilter.h @@ -26,7 +26,7 @@ namespace paddle { namespace 
operators { namespace math { -#pragma pack(4) +#pragma pack(push, 4) struct bloomfilter { uint64_t magic_num; uint64_t m; @@ -34,6 +34,8 @@ struct bloomfilter { uint64_t count; unsigned char bit_vector[1]; }; +#pragma pack(pop) + int bloomfilter_get(const struct bloomfilter *bloomfilter, const void *key, size_t len); From 6b77bfff91281c542d9dd244d580392501ef37de Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Fri, 28 Oct 2022 11:03:16 +0800 Subject: [PATCH 15/91] fix default setting of dygraph PTQ (#47413) --- python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py index 22c30b3166cee..67b7c1073e42c 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py @@ -140,7 +140,7 @@ def save_quantized_model(self, model, path, input_spec=None, **config): assert isinstance( model, paddle.nn.Layer ), "The model must be the instance of paddle.nn.Layer." - is_postprocess = config.get('postprocess', False) + is_postprocess = config.get('postprocess', True) config.pop('postprocess', None) # Convert and save dygraph quantized model From 533f6cbda3fc17814a828c88ead876775b94c2c1 Mon Sep 17 00:00:00 2001 From: Wilber Date: Fri, 28 Oct 2022 11:15:06 +0800 Subject: [PATCH 16/91] Revert "Optimiza params sync between CPU and GPU. (#45805)" (#47356) This reverts commit a2b2af90593d0e45e7b122c81c6f426b39b066af. --- .../ir_params_sync_among_devices_pass.cc | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 6711c60ff86d2..1c9e5bd7b9f45 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -21,12 +21,9 @@ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" #include "paddle/phi/common/data_type.h" namespace paddle { @@ -116,27 +113,6 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { reserve_cpu_weights = true; } - int64_t params_total_bytes{0}; - for (auto *node : paddle::framework::ir::TopologySortOperations(graph)) { - if (!node->IsOp()) continue; - if (node->Op()->Type() == "feed" || node->Op()->Type() == "fetch") continue; - for (auto *var_node : node->inputs) { - if (!var_node->Var()->Persistable()) continue; - auto var_name = var_node->Var()->Name(); - auto *var = scope->FindLocalVar(var_name); - if (var->IsType()) { - auto *t = var->GetMutable(); - params_total_bytes += t->numel() * experimental::SizeOf(t->dtype()); - } - } - } - - { - // Alloc memory in pool to store all parameters. 
- phi::DenseTensor ts; - ts.mutable_data(place, params_total_bytes); - } - std::unordered_set visited; for (auto *node : paddle::framework::ir::TopologySortOperations(graph)) { if (!node->IsOp()) continue; From 6baeb2d1066b58be0f64d3f864b6e3aea0f5974d Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 28 Oct 2022 11:26:44 +0800 Subject: [PATCH 17/91] Generate static graph code for some activation ops by Yaml (#47382) * generate static graph code for some activation op * fix example code of cosh --- .pre-commit-config.yaml | 2 +- paddle/fluid/operators/activation_op.cc | 132 ------ paddle/phi/api/yaml/backward.yaml | 121 +++++ paddle/phi/api/yaml/legacy_backward.yaml | 121 ----- paddle/phi/api/yaml/legacy_ops.yaml | 179 ++------ paddle/phi/api/yaml/op_compat.yaml | 56 ++- paddle/phi/api/yaml/ops.yaml | 127 +++++- paddle/phi/ops/compat/activation_sig.cc | 22 - python/paddle/tensor/ops.py | 556 ++++++++++++++++------- 9 files changed, 718 insertions(+), 598 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c551cae5791f0..336617ee49cb3 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,7 +46,7 @@ repos: - id: detect-private-key - id: end-of-file-fixer - id: sort-simple-yaml - files: (op|backward|op_[a-z_]+)\.yaml$ + files: (ops|backward|op_[a-z_]+)\.yaml$ - id: trailing-whitespace files: (.*\.(py|bzl|md|rst|c|cc|cxx|cpp|cu|h|hpp|hxx|xpu|kps|cmake|yaml|yml|hook)|BUILD|.*\.BUILD|WORKSPACE|CMakeLists\.txt)$ - repo: local diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index d75ae20da3ad9..a444812ed99f8 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -221,70 +221,6 @@ Floor Activation Operator. Computes floor of x element-wise. )DOC"; -UNUSED constexpr char CosDoc[] = R"DOC( -Cosine Operator. Computes cosine of x element-wise. - -Input range is `(-inf, inf)` and output range is `[-1,1]`. - -.. math:: - out = cos(x) - -)DOC"; - -UNUSED constexpr char TanDoc[] = R"DOC( -Tangent Operator. Computes tangent of x element-wise. - -Input range is `(k*pi-pi/2, k*pi+pi/2)` and output range is `(-inf, inf)`. - -$$out = tan(x)$$ - -)DOC"; - -UNUSED constexpr char SinDoc[] = R"DOC( -Sine Activation Operator. - -$$out = sin(x)$$ - -)DOC"; - -UNUSED constexpr char SinhDoc[] = R"DOC( -Sinh Activation Operator. - -$$out = sinh(x)$$ - -)DOC"; - -UNUSED constexpr char CoshDoc[] = R"DOC( -Cosh Activation Operator. - -Input range `(-inf, inf)`, output range `(1, inf)`. - -.. math:: - out = \frac{exp(x)+exp(-x)}{2} - -)DOC"; - -UNUSED constexpr char AsinhDoc[] = R"DOC( -Asinh Activation Operator. - -$$out = asinh(x)$$ - -)DOC"; - -UNUSED constexpr char AcoshDoc[] = R"DOC( -Acosh Activation Operator. - -$$out = acosh(x)$$ - -)DOC"; - -UNUSED constexpr char AtanhDoc[] = R"DOC( -Atanh Activation Operator. - -$$out = atanh(x)$$ - -)DOC"; - UNUSED constexpr char RoundDoc[] = R"DOC( The OP rounds the values in the input to the nearest integer value. @@ -357,55 +293,6 @@ Softsign Activation Operator. )DOC"; -class AcosOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "Input of acos operator"); - AddOutput("Out", "Tensor, same shape and dtype as input"); - AddComment(R"DOC( -Arccosine Operator. - -.. 
math:: - out = \cos^{-1}(x) - -)DOC"); - } -}; - -class AsinOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "Input of asin operator, an N-D Tensor, with data type float32, " - "float64 or float16."); - AddOutput("Out", "Tensor, same shape and dtype as input."); - AddComment(R"DOC( -Arcsine Operator. - -.. math:: - out = \sin^{-1}(x) - -)DOC"); - } -}; - -class AtanOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "Input of atan operator, an N-D Tensor, with data type float32, " - "float64 or float16."); - AddOutput("Out", "Tensor, same shape and dtype as input x"); - AddComment(R"DOC( -Arctangent Operator. - -.. math:: - out = \tan^{-1}(x) - -)DOC"); - } -}; - class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -807,14 +694,6 @@ REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc); REGISTER_ACTIVATION_OP_MAKER(Rsqrt, RsqrtDoc); REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc); REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc); -REGISTER_ACTIVATION_OP_MAKER(Cos, CosDoc); -REGISTER_ACTIVATION_OP_MAKER(Tan, TanDoc); -REGISTER_ACTIVATION_OP_MAKER(Sin, SinDoc); -REGISTER_ACTIVATION_OP_MAKER(Sinh, SinhDoc); -REGISTER_ACTIVATION_OP_MAKER(Cosh, CoshDoc); -REGISTER_ACTIVATION_OP_MAKER(Acosh, AcoshDoc); -REGISTER_ACTIVATION_OP_MAKER(Asinh, AsinhDoc); -REGISTER_ACTIVATION_OP_MAKER(Atanh, AtanhDoc); REGISTER_ACTIVATION_OP_MAKER(Round, RoundDoc); REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc); REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); @@ -1388,17 +1267,6 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); -REGISTER_ACTIVATION_OP(cos, Cos, CosFunctor, CosGradFunctor) -REGISTER_ACTIVATION_OP(tan, Tan, TanFunctor, TanGradFunctor); -REGISTER_ACTIVATION_OP(acos, Acos, AcosFunctor, AcosGradFunctor); -REGISTER_ACTIVATION_OP(sin, Sin, SinFunctor, SinGradFunctor); -REGISTER_ACTIVATION_OP(asin, Asin, AsinFunctor, AsinGradFunctor); -REGISTER_ACTIVATION_OP(atan, Atan, AtanFunctor, AtanGradFunctor); -REGISTER_ACTIVATION_OP(sinh, Sinh, SinhFunctor, SinhGradFunctor); -REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); -REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); -REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); -REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor); REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu, diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 6a14d8e02902a..db97795b5b425 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1,3 +1,47 @@ +- backward_op : acos_grad + forward : acos (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : acos_grad + inplace : (out_grad -> x_grad) + +- backward_op : acosh_grad + forward : acosh (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : acosh_grad + inplace : (out_grad -> x_grad) + +- backward_op : asin_grad + forward : asin (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + 
kernel : + func : asin_grad + inplace : (out_grad -> x_grad) + +- backward_op : asinh_grad + forward : asinh (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : asinh_grad + inplace : (out_grad -> x_grad) + - backward_op : atan2_grad forward : atan2 (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) @@ -8,6 +52,28 @@ kernel : func : atan2_grad +- backward_op : atan_grad + forward : atan (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : atan_grad + inplace : (out_grad -> x_grad) + +- backward_op : atanh_grad + forward : atanh (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : atanh_grad + inplace : (out_grad -> x_grad) + - backward_op : cholesky_grad forward : cholesky (Tensor x, bool upper) -> Tensor(out) args : (Tensor out, Tensor out_grad, bool upper) @@ -28,6 +94,28 @@ kernel : func : cholesky_solve_grad +- backward_op : cos_grad + forward : cos (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : cos_grad + inplace : (out_grad -> x_grad) + +- backward_op : cosh_grad + forward : cosh (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : cosh_grad + inplace : (out_grad -> x_grad) + - backward_op : cross_grad forward : cross (Tensor x, Tensor y, int axis = 9) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis) @@ -205,6 +293,28 @@ kernel : func : poisson_grad +- backward_op : sin_grad + forward : sin (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : sin_grad + inplace : (out_grad -> x_grad) + +- backward_op : sinh_grad + forward : sinh (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : sinh_grad + inplace : (out_grad -> x_grad) + - backward_op : solve_grad forward : solve (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out, Tensor out_grad) @@ -215,6 +325,17 @@ kernel : func : solve_grad +- backward_op : tan_grad + forward : tan (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : tan_grad + inplace : (out_grad -> x_grad) + - backward_op : trace_grad forward : trace (Tensor x, int offset, int axis1, int axis2) -> Tensor(out) args : (Tensor x, Tensor out_grad, int offset, int axis1, int axis2) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 499597897fd47..ced3d75bb9639 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -21,28 +21,6 @@ func : abs_grad backward : abs_double_grad -- backward_op : acos_grad - forward : acos (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : acos_grad - inplace : (out_grad -> 
x_grad) - -- backward_op : acosh_grad - forward : acosh (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : acosh_grad - inplace : (out_grad -> x_grad) - - backward_op : add_double_grad forward : add_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) @@ -158,28 +136,6 @@ output : Tensor(x_grad) invoke : as_complex(out_grad) -- backward_op : asin_grad - forward : asin (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : asin_grad - inplace : (out_grad -> x_grad) - -- backward_op : asinh_grad - forward : asinh (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : asinh_grad - inplace : (out_grad -> x_grad) - - backward_op : assign_grad forward : assign (Tensor x) -> Tensor(out) args : (Tensor out_grad) @@ -196,28 +152,6 @@ func : assign inplace : (out_grad -> x_grad) -- backward_op : atan_grad - forward : atan (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : atan_grad - inplace : (out_grad -> x_grad) - -- backward_op : atanh_grad - forward : atanh (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : atanh_grad - inplace : (out_grad -> x_grad) - - backward_op : batch_norm_double_grad forward : batch_norm_grad (Tensor x, Tensor scale, Tensor bias, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor grad_out, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) args : (Tensor x, Tensor scale, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor grad_out, Tensor grad_x_grad, Tensor grad_scale_grad, Tensor grad_bias_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) @@ -500,28 +434,6 @@ func : conv3d_transpose_grad use_gpudnn : true -- backward_op : cos_grad - forward : cos (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : cos_grad - inplace : (out_grad -> x_grad) - -- backward_op : cosh_grad - forward : cosh (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : cosh_grad - inplace : (out_grad -> x_grad) - - backward_op : crop_tensor_grad forward : crop_tensor (Tensor x, IntArray shape, IntArray offsets) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray offsets) @@ -2106,28 +2018,6 @@ func : silu_grad inplace : (out_grad -> x_grad) -- backward_op : sin_grad - forward : sin (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : sin_grad - inplace : (out_grad 
-> x_grad) - -- backward_op : sinh_grad - forward : sinh (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : sinh_grad - inplace : (out_grad -> x_grad) - - backward_op : slice_double_grad forward : slice_grad (Tensor input, Tensor grad_out, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(grad_input) args : (Tensor grad_input_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) @@ -2408,17 +2298,6 @@ kernel : func : take_along_axis_grad -- backward_op : tan_grad - forward : tan (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : tan_grad - inplace : (out_grad -> x_grad) - - backward_op : tanh_double_grad forward : tanh_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) args : (Tensor out, Tensor grad_out, Tensor grad_x_grad) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 4a0d31702494a..b0d79886c14dd 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -19,24 +19,6 @@ func : accuracy dtype : x -- op : acos - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : acos - backward : acos_grad - -- op : acosh - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : acosh - backward : acosh_grad - - op : adadelta_ args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, float rho, float epsilon) output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out) @@ -236,24 +218,6 @@ func : as_real backward : as_real_grad -- op : asin - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : asin - backward : asin_grad - -- op : asinh - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : asinh - backward : asinh_grad - - op : assign args : (Tensor x) output : Tensor @@ -288,24 +252,6 @@ data_type : dtype backend : place > output -- op : atan - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : atan - backward : atan_grad - -- op : atanh - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : atanh - backward : atanh_grad - - op : auc args : (Tensor x, Tensor label, Tensor stat_pos, Tensor stat_neg, Tensor ins_tag_weight, str curve, int num_thresholds, int slide_steps) output : Tensor(auc), Tensor(stat_pos_out), Tensor(stat_neg_out) @@ -589,24 +535,6 @@ output : Tensor(out) invoke : copy_to_impl(x, place, blocking) -- op : cos - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : cos - backward : cos_grad - -- op : cosh - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : cosh - backward : cosh_grad - - op : crop_tensor args : (Tensor x, IntArray shape, IntArray offsets) output : Tensor(out) @@ -1939,6 +1867,16 @@ kernel : func : not_equal +- op : numel + args : (Tensor x) + output : Tensor(size) + infer_meta : + func : SizeInferMeta + kernel : + func : size + data_transform: + skip_transform : x + - op : one_hot args : (Tensor x, Scalar(int) num_classes) output : Tensor(out) @@ -2402,34 
+2340,6 @@ func : silu backward : silu_grad -- op : sin - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : sin - backward : sin_grad - -- op : sinh - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : sinh - backward : sinh_grad - -- op : numel - args : (Tensor x) - output : Tensor(size) - infer_meta : - func : SizeInferMeta - kernel : - func : size - data_transform: - skip_transform : x - - op : slice args : (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) output : Tensor @@ -2448,16 +2358,6 @@ func : slogdeterminant backward : slogdet_grad -- op : softshrink - args : (Tensor x, float threshold) - output : Tensor - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : soft_shrink - backward : softshrink_grad - - op : softmax args : (Tensor x, int axis) output : Tensor(out) @@ -2479,6 +2379,16 @@ func : softplus backward : softplus_grad +- op : softshrink + args : (Tensor x, float threshold) + output : Tensor + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : soft_shrink + backward : softshrink_grad + - op : softsign args : (Tensor x) output : Tensor @@ -2637,15 +2547,6 @@ data_type : arr backward : take_along_axis_grad -- op : tan - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : tan - backward : tan_grad - - op : tanh args : (Tensor x) output : Tensor(out) @@ -2777,17 +2678,6 @@ backend : place data_type : dtype -- op : update_loss_scaling_ - args : (Tensor[] x, Tensor found_infinite, Tensor prev_loss_scaling, Tensor in_good_steps, Tensor in_bad_steps, int incr_every_n_steps, int decr_every_n_nan_or_inf, float incr_ratio, float decr_ratio, Scalar stop_update) - output : Tensor[](out){x.size()}, Tensor(loss_scaling), Tensor(out_good_steps), Tensor(out_bad_steps) - infer_meta : - func : UpdateLossScalingInferMeta - param : [x, found_infinite, prev_loss_scaling, in_good_steps, in_bad_steps] - kernel : - func : update_loss_scaling - data_type : x - inplace : (x -> out), (prev_loss_scaling -> loss_scaling), (in_good_steps -> out_good_steps), (in_bad_steps -> out_bad_steps) - - op : unbind args : (Tensor input, int axis) output : Tensor[] {axis<0 ? 
input.dims()[input.dims().size()+axis]:input.dims()[axis]} @@ -2858,6 +2748,17 @@ func : unstack backward : unstack_grad +- op : update_loss_scaling_ + args : (Tensor[] x, Tensor found_infinite, Tensor prev_loss_scaling, Tensor in_good_steps, Tensor in_bad_steps, int incr_every_n_steps, int decr_every_n_nan_or_inf, float incr_ratio, float decr_ratio, Scalar stop_update) + output : Tensor[](out){x.size()}, Tensor(loss_scaling), Tensor(out_good_steps), Tensor(out_bad_steps) + infer_meta : + func : UpdateLossScalingInferMeta + param : [x, found_infinite, prev_loss_scaling, in_good_steps, in_bad_steps] + kernel : + func : update_loss_scaling + data_type : x + inplace : (x -> out), (prev_loss_scaling -> loss_scaling), (in_good_steps -> out_good_steps), (in_bad_steps -> out_bad_steps) + - op : viterbi_decode args : (Tensor potentials, Tensor transition_params, Tensor lengths, bool include_bos_eos_tag) output : Tensor(scores), Tensor(path) @@ -2926,6 +2827,15 @@ output : Tensor(out) invoke : full_like(x, 0, dtype, place) +- op: bincount + args: (Tensor x, Tensor weights, Scalar minlength) + output: Tensor(out) + infer_meta: + func: BincountInferMeta + kernel: + func: bincount + optional: weights + - op: broadcast_tensors args: (Tensor[] input) output: Tensor[]{input.size()} @@ -3015,12 +2925,3 @@ func: unpool3d data_type: x backward: unpool3d_grad - -- op: bincount - args: (Tensor x, Tensor weights, Scalar minlength) - output: Tensor(out) - infer_meta: - func: BincountInferMeta - kernel: - func: bincount - optional: weights diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index cf47c79cb6d67..59d258f0b0a88 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -8,7 +8,17 @@ extra : attrs : [bool use_mkldnn = false] +- op : acos + inputs : + x : X + outputs : + out : Out + - op : acosh + inputs : + x : X + outputs : + out : Out backward : acosh_grad extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] @@ -34,11 +44,27 @@ extra : attrs : [bool use_mkldnn = false] +- op : asin + inputs : + x : X + outputs : + out : Out + - op : asinh backward : asinh_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] +- op : atan + inputs : + x : X + outputs : + out : Out + - op : atan2 inputs : {x : X1, y : X2} @@ -47,6 +73,10 @@ - op : atanh backward : atanh_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] @@ -145,11 +175,19 @@ - op : cos backward : cos_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] - op : cosh backward : cosh_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] @@ -271,14 +309,12 @@ - op : exp backward : exp_grad - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : exp inputs : x : X outputs : out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] - op : expand (expand_v2) backward : expand_grad (expand_v2_grad) @@ -670,11 +706,19 @@ - op : sin backward : sin_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] - op : sinh backward : sinh_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] @@ -748,6 +792,10 @@ - op : tan backward : tan_grad + inputs : + x : X + outputs : + out : Out extra 
: attrs : [bool use_mkldnn = false, bool use_cudnn = false] diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 4d0de760ace5d..ec1ba17be672d 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1,3 +1,48 @@ +- op : acos + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : acos + backward : acos_grad + +- op : acosh + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : acosh + backward : acosh_grad + +- op : asin + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : asin + backward : asin_grad + +- op : asinh + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : asinh + backward : asinh_grad + +- op : atan + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : atan + backward : atan_grad + - op : atan2 args : (Tensor x, Tensor y) output : Tensor @@ -7,6 +52,15 @@ func : atan2 backward : atan2_grad +- op : atanh + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : atanh + backward : atanh_grad + - op : bernoulli args : (Tensor x) output : Tensor(out) @@ -33,15 +87,23 @@ func : cholesky_solve backward : cholesky_solve_grad -- op : exp +- op : cos args : (Tensor x) - output : Tensor(out) + output : Tensor infer_meta : func : UnchangedInferMeta kernel : - func : exp - inplace : (x -> out) - backward : exp_grad + func : cos + backward : cos_grad + +- op : cosh + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : cosh + backward : cosh_grad - op : cross args : (Tensor x, Tensor y, int axis = 9) @@ -118,6 +180,16 @@ inplace : (x -> out) backward : erfinv_grad +- op : exp + args : (Tensor x) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + kernel : + func : exp + inplace : (x -> out) + backward : exp_grad + - op : fft_c2c args : (Tensor x, int64_t[] axes, str normalization, bool forward) output : Tensor @@ -145,6 +217,15 @@ func : fft_r2c backward : fft_r2c_grad +- op : flip + args : (Tensor x, int[] axis) + output : Tensor (out) + infer_meta : + func : FlipInferMeta + kernel : + func : flip + backward : flip_grad + - op : graph_send_uv args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") output : Tensor(out) @@ -182,6 +263,24 @@ func : poisson backward : poisson_grad +- op : sin + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : sin + backward : sin_grad + +- op : sinh + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : sinh + backward : sinh_grad + - op : solve args : (Tensor x, Tensor y) output : Tensor @@ -192,6 +291,15 @@ data_type : x backward : solve_grad +- op : tan + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : tan + backward : tan_grad + - op : trace args : (Tensor x, int offset = 0, int axis1 = 0, int axis2 = 1) output : Tensor @@ -209,12 +317,3 @@ kernel : func : trunc backward : trunc_grad - -- op : flip - args : (Tensor x, int[] axis) - output : Tensor (out) - infer_meta : - func : FlipInferMeta - kernel : - func : flip - backward : flip_grad diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index d555e2a933967..e70e5f72d44c3 100644 --- 
a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -39,17 +39,6 @@ namespace phi { #define comma , -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Square, "square", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max"); @@ -240,17 +229,6 @@ PD_REGISTER_BASE_KERNEL_NAME(rsqrt_grad_grad, rsqrt_double_grad); PD_REGISTER_BASE_KERNEL_NAME(celu_grad_grad, celu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(square_grad_grad, square_double_grad); -PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(acos_grad, phi::AcosGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sin_grad, phi::SinGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(asin_grad, phi::AsinGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(atan_grad, phi::AtanGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sinh_grad, phi::SinhGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cosh_grad, phi::CoshGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(asinh_grad, phi::AsinhGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(acosh_grad, phi::AcoshGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(expm1_grad, phi::Expm1GradOpArgumentMapping); diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py index a0a0f01486b62..3aa63e2e09387 100644 --- a/python/paddle/tensor/ops.py +++ b/python/paddle/tensor/ops.py @@ -40,25 +40,14 @@ __unary_func__ = [ 'expm1', - 'atan', 'sqrt', 'rsqrt', 'abs', 'ceil', 'floor', - 'cos', - 'tan', - 'acos', - 'sin', - 'sinh', - 'asin', - 'cosh', 'round', 'reciprocal', 'square', - 'acosh', - 'asinh', - 'atanh', ] __inplace_unary_func__ = [ @@ -191,22 +180,6 @@ """, ) -add_sample_code( - globals()["atan"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.atan(x) - print(out) - # [-0.38050638 -0.19739556 0.09966865 0.29145679] - -""", -) - add_sample_code( globals()["tanh_shrink"], r""" @@ -305,23 +278,23 @@ ) add_sample_code( - globals()["cos"], + globals()["round"], r""" Examples: .. code-block:: python import paddle - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.cos(x) + x = paddle.to_tensor([-0.5, -0.2, 0.6, 1.5]) + out = paddle.round(x) print(out) - # [0.92106099 0.98006658 0.99500417 0.95533649] + # [-1. -0. 1. 2.] """, ) add_sample_code( - globals()["tan"], + globals()["reciprocal"], r""" Examples: .. code-block:: python @@ -329,15 +302,15 @@ import paddle x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.tan(x) + out = paddle.reciprocal(x) print(out) - # [-0.42279324, -0.20271005, 0.10033467, 0.30933627] + # [-2.5 -5. 10. 
3.33333333] """, ) add_sample_code( - globals()["acos"], + globals()["square"], r""" Examples: .. code-block:: python @@ -345,206 +318,346 @@ import paddle x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.acos(x) + out = paddle.square(x) print(out) - # [1.98231317 1.77215425 1.47062891 1.26610367] + # [0.16 0.04 0.01 0.09] """, ) add_sample_code( - globals()["sin"], + globals()["softplus"], r""" Examples: .. code-block:: python import paddle + import paddle.nn.functional as F x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.sin(x) + out = F.softplus(x) print(out) - # [-0.38941834 -0.19866933 0.09983342 0.29552021] + # [0.513015, 0.598139, 0.744397, 0.854355] """, ) add_sample_code( - globals()["asin"], + globals()["softsign"], r""" Examples: .. code-block:: python import paddle + import paddle.nn.functional as F x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.asin(x) + out = F.softsign(x) print(out) - # [-0.41151685 -0.20135792 0.10016742 0.30469265] + # [-0.285714, -0.166667, 0.0909091, 0.230769] """, ) -add_sample_code( - globals()["cosh"], - r""" -Examples: - .. code-block:: python - import paddle +def acos(x, name=None): + """ + Acos Activation Operator. - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.cosh(x) - print(out) - # [1.08107237 1.02006676 1.00500417 1.04533851] + .. math:: + out = cos^{-1}(x) -""", -) + Args: + x (Tensor): Input of Acos operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. -add_sample_code( - globals()["sinh"], - r""" -Examples: - .. code-block:: python + Returns: + Tensor. Output of Acos operator, a Tensor with shape same as input. - import paddle + Examples: + .. code-block:: python - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.sinh(x) - print(out) - # [-0.41075233 -0.201336 0.10016675 0.30452029] + import paddle -""", -) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.acos(x) + print(out) + # [1.98231317 1.77215425 1.47062891 1.26610367] -add_sample_code( - globals()["asinh"], - r""" -Examples: - .. code-block:: python + """ + if in_dygraph_mode(): + return _C_ops.acos(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.acos(x) - import paddle + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'acos') + helper = LayerHelper('acos', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='acos', inputs={"X": x}, outputs={"Out": out}) + return out - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.asinh(x) - print(out) - # [-0.39003533, -0.19869010, 0.09983408, 0.29567307] -""", -) +def acosh(x, name=None): + """ + Acosh Activation Operator. -add_sample_code( - globals()["acosh"], - r""" -Examples: - .. code-block:: python + .. math:: + out = acosh(x) - import paddle + Args: + x (Tensor): Input of Acosh operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - x = paddle.to_tensor([1., 3., 4., 5.]) - out = paddle.acosh(x) - print(out) - # [0. , 1.76274729, 2.06343699, 2.29243159] + Returns: + Tensor. Output of Acosh operator, a Tensor with shape same as input. -""", -) + Examples: + .. code-block:: python -add_sample_code( - globals()["atanh"], - r""" -Examples: - .. 
code-block:: python + import paddle - import paddle + x = paddle.to_tensor([1., 3., 4., 5.]) + out = paddle.acosh(x) + print(out) + # [0. , 1.76274729, 2.06343699, 2.29243159] - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.atanh(x) - print(out) - # [-0.42364895, -0.20273256, 0.10033535, 0.30951962] + """ + if in_dygraph_mode(): + return _C_ops.acosh(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.acosh(x) -""", -) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'acosh') + helper = LayerHelper('acosh', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='acosh', inputs={"X": x}, outputs={"Out": out}) + return out -add_sample_code( - globals()["round"], - r""" -Examples: - .. code-block:: python - import paddle +def asin(x, name=None): + """ + Arcsine Operator. - x = paddle.to_tensor([-0.5, -0.2, 0.6, 1.5]) - out = paddle.round(x) - print(out) - # [-1. -0. 1. 2.] + .. math:: + out = sin^{-1}(x) -""", -) + Args: + x (Tensor): Input of Asin operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. -add_sample_code( - globals()["reciprocal"], - r""" -Examples: - .. code-block:: python + Returns: + Tensor. Same shape and dtype as input. - import paddle + Examples: + .. code-block:: python - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.reciprocal(x) - print(out) - # [-2.5 -5. 10. 3.33333333] + import paddle -""", -) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.asin(x) + print(out) + # [-0.41151685 -0.20135792 0.10016742 0.30469265] -add_sample_code( - globals()["square"], - r""" -Examples: - .. code-block:: python + """ + if in_dygraph_mode(): + return _C_ops.asin(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.asin(x) - import paddle + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'asin') + helper = LayerHelper('asin', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='asin', inputs={"X": x}, outputs={"Out": out}) + return out - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.square(x) - print(out) - # [0.16 0.04 0.01 0.09] -""", -) +def asinh(x, name=None): + """ + Asinh Activation Operator. -add_sample_code( - globals()["softplus"], - r""" -Examples: - .. code-block:: python + .. math:: + out = asinh(x) - import paddle - import paddle.nn.functional as F + Args: + x (Tensor): Input of Asinh operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.softplus(x) - print(out) - # [0.513015, 0.598139, 0.744397, 0.854355] + Returns: + Tensor. Output of Asinh operator, a Tensor with shape same as input. -""", -) + Examples: + .. code-block:: python -add_sample_code( - globals()["softsign"], - r""" -Examples: - .. 
code-block:: python + import paddle - import paddle - import paddle.nn.functional as F + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.asinh(x) + print(out) + # [-0.39003533, -0.19869010, 0.09983408, 0.29567307] - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.softsign(x) - print(out) - # [-0.285714, -0.166667, 0.0909091, 0.230769] + """ + if in_dygraph_mode(): + return _C_ops.asinh(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.asinh(x) -""", -) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'asinh') + helper = LayerHelper('asinh', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='asinh', inputs={"X": x}, outputs={"Out": out}) + return out + + +def atan(x, name=None): + """ + Arctangent Operator. + + .. math:: + out = tan^{-1}(x) + + Args: + x (Tensor): Input of Atan operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. Same shape and dtype as input x. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.atan(x) + print(out) + # [-0.38050638 -0.19739556 0.09966865 0.29145679] + + """ + if in_dygraph_mode(): + return _C_ops.atan(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.atan(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'atan') + helper = LayerHelper('atan', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='atan', inputs={"X": x}, outputs={"Out": out}) + return out + + +def atanh(x, name=None): + """ + Atanh Activation Operator. + + .. math:: + out = atanh(x) + + Args: + x (Tensor): Input of Atan operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. Output of Atanh operator, a Tensor with shape same as input. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.atanh(x) + print(out) + # [-0.42364895, -0.20273256, 0.10033535, 0.30951962] + + """ + if in_dygraph_mode(): + return _C_ops.atanh(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.atanh(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'atanh') + helper = LayerHelper('atanh', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='atanh', inputs={"X": x}, outputs={"Out": out}) + return out + + +def cos(x, name=None): + """ + Cosine Operator. Computes cosine of x element-wise. + + Input range is `(-inf, inf)` and output range is `[-1,1]`. + + .. math:: + out = cos(x) + + Args: + x (Tensor): Input of Cos operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. Output of Cos operator, a Tensor with shape same as input. + + Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.cos(x) + print(out) + # [0.92106099 0.98006658 0.99500417 0.95533649] + + """ + if in_dygraph_mode(): + return _C_ops.cos(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.cos(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'cos') + helper = LayerHelper('cos', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='cos', inputs={"X": x}, outputs={"Out": out}) + return out + + +def cosh(x, name=None): + """ + Cosh Activation Operator. + + Input range `(-inf, inf)`, output range `(1, inf)`. + + .. math:: + out = \frac{exp(x)+exp(-x)}{2} + + Args: + x (Tensor): Input of Cosh operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. Output of Cosh operator, a Tensor with shape same as input. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.cosh(x) + print(out) + # [1.08107237 1.02006676 1.00500417 1.04533851] + + """ + if in_dygraph_mode(): + return _C_ops.cosh(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.cosh(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'cosh') + helper = LayerHelper('cosh', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='cosh', inputs={"X": x}, outputs={"Out": out}) + return out def exp(x, name=None): @@ -598,6 +711,119 @@ def exp(x, name=None): return out +def sin(x, name=None): + """ + Sine Activation Operator. + + .. math:: + out = sin(x) + + Args: + x (Tensor): Input of Sin operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. Output of Sin operator, a Tensor with shape same as input. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.sin(x) + print(out) + # [-0.38941834 -0.19866933 0.09983342 0.29552021] + + """ + if in_dygraph_mode(): + return _C_ops.sin(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.sin(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'sin') + helper = LayerHelper('sin', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='sin', inputs={"X": x}, outputs={"Out": out}) + return out + + +def sinh(x, name=None): + """ + Sinh Activation Operator. + + .. math:: + out = sinh(x) + + Args: + x (Tensor): Input of Sinh operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. Output of Sinh operator, a Tensor with shape same as input. + + Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.sinh(x) + print(out) + # [-0.41075233 -0.201336 0.10016675 0.30452029] + + """ + if in_dygraph_mode(): + return _C_ops.sinh(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.sinh(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'sinh') + helper = LayerHelper('sinh', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='sinh', inputs={"X": x}, outputs={"Out": out}) + return out + + +def tan(x, name=None): + """ + Tangent Operator. Computes tangent of x element-wise. + + Input range is `(k*pi-pi/2, k*pi+pi/2)` and output range is `(-inf, inf)`. + + .. math:: + out = tan(x) + + Args: + x (Tensor): Input of Tan operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. Output of Tan operator, a Tensor with shape same as input. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.tan(x) + print(out) + # [-0.42279324, -0.20271005, 0.10033467, 0.30933627] + + """ + if in_dygraph_mode(): + return _C_ops.tan(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.tan(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'tan') + helper = LayerHelper('tan', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='tan', inputs={"X": x}, outputs={"Out": out}) + return out + + __all__ += ['erf'] _erf_ = generate_layer_fn('erf') From 57d5ffa5fd340831f90d0a6f15f7cab930cd8842 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Fri, 28 Oct 2022 12:59:25 +0800 Subject: [PATCH 18/91] [Dygraph] Fix memory bugs of no sync and SplitTensors in DataParallel (#47369) * fix no sync bugs * update * update task chain fix: update wait chain feat: add `GetDeviceContext` for gloo * fix oom * fix dev * update * update Co-authored-by: LiYuRio Co-authored-by: ForFishes <2282912238@qq.com> --- .../distributed/collective/ProcessGroup.cc | 2 + .../distributed/collective/ProcessGroup.h | 3 +- .../distributed/collective/ProcessGroupGloo.h | 5 ++ .../collective/ProcessGroupNCCL.cc | 13 ++-- .../distributed/collective/ProcessGroupNCCL.h | 8 ++- .../collective/ProcessGroupStream.cc | 2 +- .../collective/ProcessGroupStream.h | 4 +- paddle/fluid/distributed/collective/Utils.h | 42 ++++++------ .../fluid/distributed/collective/reducer.cc | 64 ++++++++++++------- paddle/fluid/distributed/collective/reducer.h | 6 +- paddle/fluid/imperative/reducer.cc | 9 ++- paddle/fluid/imperative/reducer.h | 3 +- paddle/fluid/pybind/distributed_py.cc | 17 +++-- paddle/fluid/pybind/imperative.cc | 1 + python/paddle/fluid/dygraph/parallel.py | 8 +-- 15 files changed, 113 insertions(+), 74 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.cc b/paddle/fluid/distributed/collective/ProcessGroup.cc index 1db8d221aa67d..e7942b714e4f6 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.cc +++ b/paddle/fluid/distributed/collective/ProcessGroup.cc @@ -41,6 +41,8 @@ bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) { void ProcessGroup::Task::Synchronize() {} +void ProcessGroup::Task::UpdateWaitChain(const phi::DeviceContext& ctx) {} + ProcessGroup::ProcessGroup(int rank, int size, const 
platform::Place& place, diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index ca1cf7dd48ba7..afe75baeb2a4f 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -66,6 +66,7 @@ class ProcessGroup { virtual bool IsCompleted(); virtual bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); virtual void Synchronize(); + virtual void UpdateWaitChain(const phi::DeviceContext& ctx); bool IsSync() const { return sync_op_; } protected: @@ -92,7 +93,7 @@ class ProcessGroup { int GetSize() const { return size_; } virtual const std::string GetBackendName() const = 0; - virtual phi::DeviceContext* GetDeviceContext(const Place& place) const { + virtual const phi::DeviceContext& GetDeviceContext(const Place& place) const { PADDLE_THROW(platform::errors::InvalidArgument( "Does not support to get device_context from ProcessGroup%s.", GetBackendName())); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index d911da91eb1a3..f20f39b31a7a7 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -150,6 +150,11 @@ class ProcessGroupGloo : public ProcessGroup { return GLOO_BACKEND_NAME; } + const phi::DeviceContext& GetDeviceContext( + const Place& place) const override { + return *platform::DeviceContextPool::Instance().Get(place); + } + // Helper functions for Gloo. static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname( const std::string& hostname); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 2e18dfcc3ba12..76d1d42c7d653 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -110,6 +110,11 @@ bool ProcessGroupNCCL::NCCLTask::IsCompleted() { return true; } +void ProcessGroupNCCL::NCCLTask::UpdateWaitChain( + const phi::DeviceContext& ctx) { + control_events_[0].Record(*static_cast(&ctx)); +} + void ProcessGroupNCCL::CheckSplitSizes(std::vector* split_sizes, std::vector tensor_shape) { int64_t len_size = (*split_sizes).size(); @@ -1591,15 +1596,15 @@ ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const { return iter->second[0]->GetNcclComm(); } -phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( +const phi::DeviceContext& ProcessGroupNCCL::GetDeviceContext( const Place& place) const { return GetDeviceContext(place, /*use_calc_stream*/ false); } -phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( +const phi::DeviceContext& ProcessGroupNCCL::GetDeviceContext( const Place& place, bool use_calc_stream) const { if (use_calc_stream) { - return platform::DeviceContextPool::Instance().Get(place); + return *platform::DeviceContextPool::Instance().Get(place); } else { std::vector places = {place}; const auto& iter = places_to_ctx_.find(GetKeyFromPlaces(places)); @@ -1607,7 +1612,7 @@ phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( places_to_ctx_.end(), platform::errors::InvalidArgument( "Cannot find device context in process group.")); - return iter->second[0].get(); + return *iter->second[0]; } } diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 6427e9e3e2ab1..a501bf5302350 100644 --- 
a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -75,6 +75,8 @@ class ProcessGroupNCCL : public ProcessGroupStream { virtual ~NCCLTask(); + void UpdateWaitChain(const phi::DeviceContext& ctx) override; + std::vector control_events_; std::vector barrierTensors_; @@ -96,10 +98,10 @@ class ProcessGroupNCCL : public ProcessGroupStream { return std::string(NCCL_BACKEND_NAME); } - phi::DeviceContext* GetDeviceContext(const Place& place) const override; + const phi::DeviceContext& GetDeviceContext(const Place& place) const override; - phi::DeviceContext* GetDeviceContext(const Place& place, - bool use_calc_stream) const override; + const phi::DeviceContext& GetDeviceContext( + const Place& place, bool use_calc_stream) const override; std::shared_ptr AllReduce( std::vector& in_tensors, // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index b2cfae088b227..11530ab872d22 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -23,7 +23,7 @@ ProcessGroupStream::ProcessGroupStream(int rank, int gid) : ProcessGroup(rank, size, place, gid) {} -phi::DeviceContext* ProcessGroupStream::GetDeviceContext( +const phi::DeviceContext& ProcessGroupStream::GetDeviceContext( const Place& place, bool use_calc_stream) const { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support get device_context.", GetBackendName())); diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index 2f0aa139104e9..56799c4bd3ed8 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -54,8 +54,8 @@ class ProcessGroupStream : public ProcessGroup { ProcessGroupStream(int rank, int size, const platform::Place& place, int gid); virtual ~ProcessGroupStream() = default; - virtual phi::DeviceContext* GetDeviceContext(const Place& place, - bool use_calc_stream) const; + virtual const phi::DeviceContext& GetDeviceContext( + const Place& place, bool use_calc_stream) const; std::shared_ptr AllGather( std::vector& in_tensors, // NOLINT diff --git a/paddle/fluid/distributed/collective/Utils.h b/paddle/fluid/distributed/collective/Utils.h index c06c0345163ed..d9260b98dcf44 100644 --- a/paddle/fluid/distributed/collective/Utils.h +++ b/paddle/fluid/distributed/collective/Utils.h @@ -25,18 +25,18 @@ namespace distributed { template struct ConcatDenseTensor { - void operator()(const DeviceContext *context, + void operator()(const DeviceContext &context, const std::vector &in, phi::DenseTensor *out, int axis = 0) { phi::funcs::ConcatFunctor concat_functor; - concat_functor(*context, in, axis, out); + concat_functor(context, in, axis, out); } }; template struct SplitDenseTensor { - void operator()(const DeviceContext *context, + void operator()(const DeviceContext &context, const phi::DenseTensor &in, std::vector *out, int axis = 0) { @@ -46,19 +46,19 @@ struct SplitDenseTensor { shape_refer.emplace_back(p_tensor); } phi::funcs::SplitFunctor split_functor; - split_functor(*context, in, shape_refer, axis, out); + split_functor(context, in, shape_refer, axis, out); } }; #ifdef PADDLE_WITH_CUSTOM_DEVICE template struct ConcatDenseTensor { - void operator()(const platform::CustomDeviceContext *context, + void operator()(const 
platform::CustomDeviceContext &context, const std::vector &in, phi::DenseTensor *out, int axis = 0) { auto *out_data = out->data(); - auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace()); + auto *device = phi::DeviceManager::GetDeviceWithPlace(context.GetPlace()); size_t offset = 0; for (const auto &tensor : in) { const auto *in_data = tensor.data(); @@ -71,12 +71,12 @@ struct ConcatDenseTensor { template struct SplitDenseTensor { - void operator()(const platform::CustomDeviceContext *context, + void operator()(const platform::CustomDeviceContext &context, const phi::DenseTensor &in, std::vector *out, int axis = 0) { auto *in_data = in.data(); - auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace()); + auto *device = phi::DeviceManager::GetDeviceWithPlace(context.GetPlace()); size_t offset = 0; for (auto *p_tensor : *out) { auto *out_data = p_tensor->data(); @@ -89,7 +89,7 @@ struct SplitDenseTensor { #endif template -void ConcatDenseTensorWithType(const DeviceContext *dev_ctx, +void ConcatDenseTensorWithType(const DeviceContext &dev_ctx, const std::vector &t_list, phi::DenseTensor *p_out, phi::DataType type) { @@ -126,7 +126,7 @@ void ConcatDenseTensorWithType(const DeviceContext *dev_ctx, } template -void SplitDenseTensorWithType(const DeviceContext *dev_ctx, +void SplitDenseTensorWithType(const DeviceContext &dev_ctx, const phi::DenseTensor &t_in, std::vector *p_list, phi::DataType type) { @@ -162,16 +162,16 @@ void SplitDenseTensorWithType(const DeviceContext *dev_ctx, } } -void ConcatTensor(const phi::DeviceContext *dev_ctx, +void ConcatTensor(const phi::DeviceContext &dev_ctx, const std::vector &tensor_list, const experimental::Tensor *tensor) { auto *dense_tensor = std::dynamic_pointer_cast(tensor->impl()).get(); - const auto &place = dev_ctx->GetPlace(); + const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - ConcatDenseTensorWithType(static_cast(dev_ctx), + ConcatDenseTensorWithType(static_cast(dev_ctx), tensor_list, dense_tensor, tensor->dtype()); @@ -183,7 +183,7 @@ void ConcatTensor(const phi::DeviceContext *dev_ctx, } else if (platform::is_custom_place(place)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE ConcatDenseTensorWithType( - static_cast(dev_ctx), + static_cast(dev_ctx), tensor_list, dense_tensor, tensor->dtype()); @@ -194,7 +194,7 @@ void ConcatTensor(const phi::DeviceContext *dev_ctx, "CUSTOM_DEVICE support.")); #endif } else if (platform::is_cpu_place(place)) { - ConcatDenseTensorWithType(static_cast(dev_ctx), + ConcatDenseTensorWithType(static_cast(dev_ctx), tensor_list, dense_tensor, tensor->dtype()); @@ -204,20 +204,20 @@ void ConcatTensor(const phi::DeviceContext *dev_ctx, } } -void SplitTensor(const phi::DeviceContext *dev_ctx, +void SplitTensor(const phi::DeviceContext &dev_ctx, const phi::DenseTensor &tensor, const std::vector *tensor_list) { std::vector dense_list; for (auto &tensor : *tensor_list) { - auto p_tensor = + auto *p_tensor = std::dynamic_pointer_cast(tensor.impl()).get(); dense_list.emplace_back(p_tensor); } - const auto &place = dev_ctx->GetPlace(); + const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - SplitDenseTensorWithType(static_cast(dev_ctx), + SplitDenseTensorWithType(static_cast(dev_ctx), tensor, &dense_list, tensor.dtype()); @@ -229,7 +229,7 @@ void SplitTensor(const phi::DeviceContext *dev_ctx, } else if (platform::is_custom_place(place)) { 
#ifdef PADDLE_WITH_CUSTOM_DEVICE SplitDenseTensorWithType( - static_cast(dev_ctx), + static_cast(dev_ctx), tensor, &dense_list, tensor.dtype()); @@ -239,7 +239,7 @@ void SplitTensor(const phi::DeviceContext *dev_ctx, "please recompile or reinstall Paddle with CUSTOM_DEVICE support.")); #endif } else if (platform::is_cpu_place(place)) { - SplitDenseTensorWithType(static_cast(dev_ctx), + SplitDenseTensorWithType(static_cast(dev_ctx), tensor, &dense_list, tensor.dtype()); diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 0d46425b2e832..2c26828e5e114 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -16,6 +16,8 @@ #include "paddle/phi/backends/device_guard.h" #include "paddle/phi/backends/device_manager.h" +DECLARE_bool(use_stream_safe_cuda_allocator); + namespace paddle { namespace distributed { @@ -335,13 +337,20 @@ void EagerGroup::ConcatTensors(const platform::Place &place) { } } -void EagerGroup::SplitTensors(const platform::Place &place) { +void EagerGroup::SplitTensorsDev(const platform::DeviceContext &context) { + auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto *default_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); + auto &gpu_context = static_cast(context); SplitTensorsWithType( - *default_ctx, &dense_contents_, &dense_tensors_, dtype_); + gpu_context, &dense_contents_, &dense_tensors_, dtype_); + if (FLAGS_use_stream_safe_cuda_allocator) { + auto dense_tensor = + std::dynamic_pointer_cast(dense_contents_.impl()); + VLOG(3) << "Free dense_contents_ " << dense_contents_.numel(); + memory::RecordStream(dense_tensor->Holder(), gpu_context.stream()); + dense_contents_.reset(); + } #else PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't split grad tensor since it's not compiled with NCCL," @@ -349,10 +358,11 @@ void EagerGroup::SplitTensors(const platform::Place &place) { #endif } else if (platform::is_custom_place(place)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto *default_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); SplitTensorsWithType( - *default_ctx, &dense_contents_, &dense_tensors_, dtype_); + static_cast(context), + &dense_contents_, + &dense_tensors_, + dtype_); #else PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't split grad tensor since it's not compiled with " @@ -360,10 +370,10 @@ void EagerGroup::SplitTensors(const platform::Place &place) { "Please recompile or reinstall Paddle with CUSTOM_DEVICE support.")); #endif } else if (platform::is_cpu_place(place)) { - auto *default_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - SplitTensorsWithType( - *default_ctx, &dense_contents_, &dense_tensors_, dtype_); + SplitTensorsWithType(static_cast(context), + &dense_contents_, + &dense_tensors_, + dtype_); } else { PADDLE_THROW(platform::errors::Unimplemented( "Split grad tensor not supported on place (%s)", place)); @@ -578,9 +588,11 @@ void EagerReducer::TraverseBackwardGraph(const std::vector &outputs) { } } -void EagerReducer::PrepareForBackward(const std::vector &outputs) { +void EagerReducer::PrepareForBackward(const std::vector &outputs, + const bool is_sync) { VLOG(3) << "after forward, then reset count for backward."; - grad_need_hooks_ = true; + grad_need_hooks_ = is_sync; + next_group_ = 0; std::for_each(groups_.begin(), groups_.end(), [](EagerGroup 
&group) { group.pending_ = group.tensor_indices_.size(); @@ -648,9 +660,9 @@ void EagerReducer::AddDistHook(size_t var_index) { var_index)); // gradient synchronization is not required when grad_need_hooks_ is false. - if (!grad_need_hooks_) { - return; - } + // if (!grad_need_hooks_) { + // return; + // } VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name() << "@Grad] arrived and triggered disthook"; @@ -816,10 +828,12 @@ void EagerReducer::MarkGroupReady(size_t group_index) { for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0; ++next_group_) { UNUSED auto &group = groups_[next_group_]; - if (group.is_sparse_) { - AllReduceSparse(&group, next_group_); - } else { - FusedAllReduceSchedule(&group, next_group_); + if (grad_need_hooks_) { + if (group.is_sparse_) { + AllReduceSparse(&group, next_group_); + } else { + FusedAllReduceSchedule(&group, next_group_); + } } } } @@ -907,16 +921,14 @@ void EagerReducer::ProcessUnusedDenseVars() { void EagerReducer::FinalizeBackward() { groups_need_finalize_ = false; - grad_need_hooks_ = false; for (auto &group : groups_) { - if (!group.is_sparse_) { + if (!group.is_sparse_ && grad_need_hooks_) { group.task->Synchronize(); } } for (auto &group : groups_) { - if (!group.is_sparse_) { - group.SplitTensors(inner_place_); + if (!group.is_sparse_ && grad_need_hooks_) { group.dense_contents_.reset(); } } @@ -928,6 +940,7 @@ void EagerReducer::FinalizeBackward() { VLOG(3) << "ProcessUnusedDenseVars is finished."; } + grad_need_hooks_ = false; VLOG(3) << "In the batch, Reducer is finished."; } @@ -954,6 +967,9 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, } group->task = process_group_->AllReduce(in_out, in_out, opts); + const auto &context = process_group_->GetDeviceContext(inner_place_); + group->SplitTensorsDev(context); + group->task->UpdateWaitChain(context); // split in FinalizeBackward() } diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index 90848920b7e93..74db3db746729 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -74,7 +74,8 @@ class EagerGroup { void ConcatTensors(const platform::Place &); // context is used to select the stream for split - void SplitTensors(const platform::Place &); + + void SplitTensorsDev(const platform::DeviceContext &); friend std::ostream &operator<<(std::ostream &, const EagerGroup &); }; @@ -102,7 +103,8 @@ class EagerReducer { void InitializeGroups(const std::vector> &group_indices); void InitializeDenseGroups(const std::vector &tensor_indices_, EagerGroup *p_group); - void PrepareForBackward(const std::vector &outputs); + void PrepareForBackward(const std::vector &outputs, + const bool is_sync); void AddDistHook(size_t var_index); void MarkVarReady(const size_t var_index, const bool is_used_var); void MarkGroupReady(const size_t group_index); diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index f89fe234c201a..3225222f61737 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -675,9 +675,10 @@ void Reducer::TraverseBackwardGraph( // After each batch is calculated, the counter of each group(group.pending_) // and allreudce sequence counter(next_group_) will be cleaned up again. 
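// NOTE (editorial aside, not a line of this diff): PrepareForBackward now
// takes an `is_sync` flag and copies it into `grad_need_hooks_`. Under
// DataParallel.no_sync() it arrives as false, so the eager reducer's hooks
// still fire but MarkGroupReady schedules no allreduce, and this legacy
// reducer also skips the unused-vars graph traversal; gradients simply
// accumulate locally until the next synchronized backward pass.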
void Reducer::PrepareForBackward( - const std::vector> &outputs) { + const std::vector> &outputs, + const bool is_sync) { VLOG(3) << "after forward, then reset count for backward."; - grad_need_hooks_ = true; + grad_need_hooks_ = is_sync; next_group_ = 0; std::for_each(groups_.begin(), groups_.end(), [](Group &group) { group.pending_ = group.variable_indices_.size(); @@ -710,7 +711,9 @@ void Reducer::PrepareForBackward( if (find_unused_vars_once_ || find_unused_vars_each_step_) { unused_vars_.clear(); - TraverseBackwardGraph(outputs); + if (grad_need_hooks_) { + TraverseBackwardGraph(outputs); + } // only check once in first step find_unused_vars_once_ = false; } diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index c455f962788b8..902c3036acc78 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -146,7 +146,8 @@ class Reducer { void PrepareDeps(const std::unordered_set& init_nodes); void PrepareForBackward( - const std::vector>& outputs); + const std::vector>& outputs, + const bool is_sync); void AddDistHook(size_t var_index); diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 6aa8e19c99c61..fe1d82c766a0e 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -395,9 +395,10 @@ void BindDistributed(py::module *m) { concat_out_tensor.impl()); std::vector out_wrapper = {*out_dense}; - const auto *dev_ctx = self.GetDeviceContext(in_tensor.place()); + const auto &dev_ctx = self.GetDeviceContext(in_tensor.place()); auto task = self.AllGather(in_wrapper, out_wrapper, sync_op); distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + task->UpdateWaitChain(dev_ctx); return task; }, py::arg("in"), @@ -495,10 +496,11 @@ void BindDistributed(py::module *m) { std::vector out_wrapper = {*out_dense}; // in_tensor_list should not be empty - const auto *dev_ctx = + const auto &dev_ctx = self.GetDeviceContext(in_tensor_list.back().place()); auto task = self.AllToAll(in_wrapper, out_wrapper, sync_op); distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + task->UpdateWaitChain(dev_ctx); return task; }, py::arg("in"), @@ -796,7 +798,7 @@ void BindDistributed(py::module *m) { concat_out_tensor.impl()); std::vector out_wrapper = {*out_dense}; - const auto *dev_ctx = + const auto &dev_ctx = self.GetDeviceContext(in_tensor.place(), true); auto task = self.AllGather(in_wrapper, out_wrapper, @@ -905,7 +907,7 @@ void BindDistributed(py::module *m) { std::vector out_wrapper = {*out_dense}; // in_tensor_list must not be empty - const auto *dev_ctx = self.GetDeviceContext( + const auto &dev_ctx = self.GetDeviceContext( in_tensor_list.back().place(), /*use_calc_stream*/ true); auto task = self.AllToAll(in_wrapper, out_wrapper, @@ -1405,11 +1407,14 @@ void BindDistributed(py::module *m) { .def(py::init(&CreateEagerReducer)) .def( "prepare_for_backward", - [](distributed::EagerReducer &self, py::handle py_tensors) { + [](distributed::EagerReducer &self, + py::handle py_tensors, + bool is_sync) { auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); - self.PrepareForBackward(params); + self.PrepareForBackward(params, is_sync); }, py::arg("tensors"), + py::arg("is_sync"), py::call_guard()); } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 1eb5f8bd4764c..bd18d4b3319b2 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -2569,6 +2569,7 @@ void 
BindImperative(py::module *m_ptr) { .def("prepare_for_backward", &imperative::Reducer::PrepareForBackward, py::arg("vars"), + py::arg("is_sync"), py::call_guard()); m.def("assign_group_by_size", diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 51e0527e4fa99..004c21c1346b1 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -818,13 +818,9 @@ def forward(self, x): def forward(self, *inputs, **kwargs): outputs = self._layers(*inputs, **kwargs) - if ( - self._strategy.nranks > 1 - and framework._dygraph_tracer()._has_grad - and self.grad_need_sync - ): + if self._strategy.nranks > 1 and framework._dygraph_tracer()._has_grad: self._reducer.prepare_for_backward( - list(self._find_varbase(outputs)) + list(self._find_varbase(outputs)), self.grad_need_sync ) return outputs From e48b6dcfa656c7b40dd4d53c41d494b42fa276fe Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 28 Oct 2022 14:07:15 +0800 Subject: [PATCH 19/91] [JITLayer]Enable OneDNN on CPU and Fix zero shape (#47428) * [JITLayer]Enable OneDNN on CPU and Fix zero shape * remove VLOG --- .../fluid/inference/api/analysis_predictor.cc | 5 +- paddle/fluid/jit/engine/predictor_engine.cc | 68 ++++++++++--------- 2 files changed, 40 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index a78a768a7009d..0190d9c291b8e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -171,8 +171,9 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, // NOTE(Aurelius84): Some kernels support zero shape input // without memory holder, we should skip enforce logic. bool has_zero_dim = (phi::product(ddim) == 0); - if (has_zero_dim) { - VLOG(3) << "Found zero dim from input with ddim: " << ddim; + VLOG(3) << "Found zero dim: " << has_zero_dim + << " from input with ddim: " << ddim; + if (!has_zero_dim) { PADDLE_ENFORCE_NOT_NULL( input_ptr, paddle::platform::errors::Fatal( diff --git a/paddle/fluid/jit/engine/predictor_engine.cc b/paddle/fluid/jit/engine/predictor_engine.cc index d6bdf42b041a4..6a44c192c16f7 100644 --- a/paddle/fluid/jit/engine/predictor_engine.cc +++ b/paddle/fluid/jit/engine/predictor_engine.cc @@ -34,12 +34,16 @@ PredictorEngine::PredictorEngine(const std::shared_ptr &info, utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, scope_.get()); VLOG(6) << framework::GenScopeTreeDebugInfo(scope_.get()); + // TODO(Aurelius84): Expose AnalysisConfig to user. 
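
The TODO above asks to expose AnalysisConfig; on the Python side the same knobs are already reachable through paddle.inference.Config. A rough sketch mirroring the CPU branch configured in the hunk that follows (the model and parameter paths are placeholders, and the int8 switch is omitted since its availability depends on the build):

    from paddle import inference

    config = inference.Config("./inference.pdmodel", "./inference.pdiparams")
    config.disable_gpu()
    config.enable_mkldnn()                 # run CPU kernels through OneDNN
    config.set_mkldnn_cache_capacity(0)    # same cache setting as the C++ branch
    config.switch_ir_optim(True)
    predictor = inference.create_predictor(config)
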
AnalysisConfig config; config.SetProgFile(info->ProgramFilePath()); if (platform::is_gpu_place(place_)) { config.EnableUseGpu(100, place_.GetDeviceId()); } else if (platform::is_cpu_place(place_)) { config.DisableGpu(); + config.EnableMKLDNN(); + config.EnableMkldnnInt8(); + config.SetMkldnnCacheCapacity(0); } config.SetSkipLoadParams(true); config.SetApplyOptim(true); @@ -59,10 +63,6 @@ std::vector PredictorEngine::operator()( std::vector PredictorEngine::operator()( const std::vector &inputs) { - for (auto t : inputs) { - VLOG(1) << "inputs is init: " << t.initialized(); - } - std::vector pt_inputs; std::vector pt_outputs; for (auto &t : inputs) { @@ -84,22 +84,23 @@ std::vector PredictorEngine::operator()( static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t) { PaddleTensor pt; - - if (framework::TransToProtoVarType(t->dtype()) == - framework::proto::VarType::INT32) { - pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); - pt.dtype = PaddleDType::INT32; - } else if (framework::TransToProtoVarType(t->dtype()) == - framework::proto::VarType::INT64) { - pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); - pt.dtype = PaddleDType::INT64; - } else if (framework::TransToProtoVarType(t->dtype()) == - framework::proto::VarType::FP32) { - pt.data.Reset(t->data(), t->numel() * sizeof(float)); - pt.dtype = PaddleDType::FLOAT32; - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported tensor date type. Now only supports INT64, FP32, INT32.")); + switch (framework::TransToProtoVarType(t->dtype())) { + case framework::proto::VarType::INT32: { + pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); + pt.dtype = PaddleDType::INT32; + } break; + case framework::proto::VarType::INT64: { + pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); + pt.dtype = PaddleDType::INT64; + } break; + case framework::proto::VarType::FP32: { + pt.data.Reset(t->data(), t->numel() * sizeof(float)); + pt.dtype = PaddleDType::FLOAT32; + } break; + default: + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported tensor date type. 
Now " + "only supports INT64, FP32, INT32.")); } pt.shape = phi::vectorize(t->dims()); return pt; @@ -110,17 +111,22 @@ static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, const platform::Place &place) { framework::DDim ddim = phi::make_ddim(pt.shape); void *input_ptr; - if (pt.dtype == PaddleDType::INT64) { - input_ptr = t->mutable_data(ddim, place); - } else if (pt.dtype == PaddleDType::FLOAT32) { - input_ptr = t->mutable_data(ddim, place); - } else if (pt.dtype == PaddleDType::INT32) { - input_ptr = t->mutable_data(ddim, place); - } else if (pt.dtype == PaddleDType::FLOAT16) { - input_ptr = t->mutable_data(ddim, place); - } else { - LOG(ERROR) << "unsupported feed type " << pt.dtype; - return false; + switch (pt.dtype) { + case PaddleDType::INT64: + input_ptr = t->mutable_data(ddim, place); + break; + case PaddleDType::FLOAT32: + input_ptr = t->mutable_data(ddim, place); + break; + case PaddleDType::INT32: + input_ptr = t->mutable_data(ddim, place); + break; + case PaddleDType::FLOAT16: + input_ptr = t->mutable_data(ddim, place); + break; + default: + LOG(ERROR) << "unsupported feed type " << pt.dtype; + return false; } PADDLE_ENFORCE_NOT_NULL( From 0f649b32397dcd043e51b414904ccfa730b52603 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Fri, 28 Oct 2022 14:20:32 +0800 Subject: [PATCH 20/91] remove tcp store barrier (#47184) --- python/paddle/distributed/collective.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 77258f7036cbd..6825dae045f15 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -409,11 +409,8 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): # TODO(shenliang03): This is a temporary solution to solve the problem of # hang caused by tcp paddle.distributed.barrier(group=group) - # NOTE(liyurui): All processors should hang and wait using tcp store, in case master exit before sub-group is created. 
- if backend != 'heter': - _barrier_by_tcp_store(group_name, _default_store, timeout) - else: - print("Warning: store barrier is not supported for heter backend.") + if paddle.distributed.get_world_size() > 1: + paddle.distributed.barrier() return group if not backend: From 26c419ca386aeae3c461faf2b828d00b48e908eb Mon Sep 17 00:00:00 2001 From: YangZhou <56786796+SmileGoat@users.noreply.github.com> Date: Fri, 28 Oct 2022 18:15:55 +0800 Subject: [PATCH 21/91] [audio]fix audio get_window security error (#47386) * fix window security error * format --- python/paddle/audio/functional/window.py | 44 ++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/python/paddle/audio/functional/window.py b/python/paddle/audio/functional/window.py index 844e2fc26335f..315d5a50a323f 100644 --- a/python/paddle/audio/functional/window.py +++ b/python/paddle/audio/functional/window.py @@ -19,17 +19,39 @@ from paddle import Tensor +class WindowFunctionRegister(object): + def __init__(self): + self._functions_dict = dict() + + def register(self, func=None): + def add_subfunction(func): + name = func.__name__ + self._functions_dict[name] = func + return func + + return add_subfunction + + def get(self, name): + return self._functions_dict[name] + + +window_function_register = WindowFunctionRegister() + + +@window_function_register.register() def _cat(x: List[Tensor], data_type: str) -> Tensor: l = [paddle.to_tensor(_, data_type) for _ in x] return paddle.concat(l) +@window_function_register.register() def _acosh(x: Union[Tensor, float]) -> Tensor: if isinstance(x, float): return math.log(x + math.sqrt(x**2 - 1)) return paddle.log(x + paddle.sqrt(paddle.square(x) - 1)) +@window_function_register.register() def _extend(M: int, sym: bool) -> bool: """Extend window by 1 sample if needed for DFT-even symmetry.""" if not sym: @@ -38,6 +60,7 @@ def _extend(M: int, sym: bool) -> bool: return M, False +@window_function_register.register() def _len_guards(M: int) -> bool: """Handle small or incorrect window lengths.""" if int(M) != M or M < 0: @@ -46,6 +69,7 @@ def _len_guards(M: int) -> bool: return M <= 1 +@window_function_register.register() def _truncate(w: Tensor, needed: bool) -> Tensor: """Truncate window by 1 sample if needed for DFT-even symmetry.""" if needed: @@ -54,6 +78,7 @@ def _truncate(w: Tensor, needed: bool) -> Tensor: return w +@window_function_register.register() def _general_gaussian( M: int, p, sig, sym: bool = True, dtype: str = 'float64' ) -> Tensor: @@ -70,6 +95,7 @@ def _general_gaussian( return _truncate(w, needs_trunc) +@window_function_register.register() def _general_cosine( M: int, a: float, sym: bool = True, dtype: str = 'float64' ) -> Tensor: @@ -86,6 +112,7 @@ def _general_cosine( return _truncate(w, needs_trunc) +@window_function_register.register() def _general_hamming( M: int, alpha: float, sym: bool = True, dtype: str = 'float64' ) -> Tensor: @@ -95,6 +122,7 @@ def _general_hamming( return _general_cosine(M, [alpha, 1.0 - alpha], sym, dtype=dtype) +@window_function_register.register() def _taylor( M: int, nbar=4, sll=30, norm=True, sym: bool = True, dtype: str = 'float64' ) -> Tensor: @@ -151,6 +179,7 @@ def W(n): return _truncate(w, needs_trunc) +@window_function_register.register() def _hamming(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: """Compute a Hamming window. 
The Hamming window is a taper formed by using a raised cosine with @@ -159,6 +188,7 @@ def _hamming(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: return _general_hamming(M, 0.54, sym, dtype=dtype) +@window_function_register.register() def _hann(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: """Compute a Hann window. The Hann window is a taper formed by using a raised cosine or sine-squared @@ -167,6 +197,7 @@ def _hann(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: return _general_hamming(M, 0.5, sym, dtype=dtype) +@window_function_register.register() def _tukey( M: int, alpha=0.5, sym: bool = True, dtype: str = 'float64' ) -> Tensor: @@ -200,6 +231,7 @@ def _tukey( return _truncate(w, needs_trunc) +@window_function_register.register() def _kaiser( M: int, beta: float, sym: bool = True, dtype: str = 'float64' ) -> Tensor: @@ -209,6 +241,7 @@ def _kaiser( raise NotImplementedError() +@window_function_register.register() def _gaussian( M: int, std: float, sym: bool = True, dtype: str = 'float64' ) -> Tensor: @@ -226,6 +259,7 @@ def _gaussian( return _truncate(w, needs_trunc) +@window_function_register.register() def _exponential( M: int, center=None, tau=1.0, sym: bool = True, dtype: str = 'float64' ) -> Tensor: @@ -245,6 +279,7 @@ def _exponential( return _truncate(w, needs_trunc) +@window_function_register.register() def _triang(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: """Compute a triangular window.""" if _len_guards(M): @@ -262,6 +297,7 @@ def _triang(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: return _truncate(w, needs_trunc) +@window_function_register.register() def _bohman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: """Compute a Bohman window. The Bohman window is the autocorrelation of a cosine window. @@ -279,6 +315,7 @@ def _bohman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: return _truncate(w, needs_trunc) +@window_function_register.register() def _blackman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: """Compute a Blackman window. The Blackman window is a taper formed by using the first three terms of @@ -289,6 +326,7 @@ def _blackman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) +@window_function_register.register() def _cosine(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: """Compute a window with a simple cosine shape.""" if _len_guards(M): @@ -308,7 +346,7 @@ def get_window( """Return a window of a given length and type. Args: - window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. + window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. win_length (int): Number of samples. fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True. dtype (str, optional): The data type of the return window. Defaults to 'float64'. 
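
For context on the security fix in the next hunk: looking a window up in the WindowFunctionRegister added above bounds what a user-supplied name can reach, whereas the old eval-based lookup executes whatever expression the name happens to form. A standalone toy illustration (not Paddle code):

    registry = {'_hann': lambda M: M}                  # toy registered function

    def lookup_eval(winstr):
        return eval('_' + winstr)                      # old, unsafe pattern

    def lookup_registry(winstr):
        try:
            return registry['_' + winstr]              # new, safe pattern
        except KeyError as e:
            raise ValueError("Unknown window type.") from e

    # A crafted name turns the eval lookup into arbitrary code execution:
    #   lookup_eval("_import__('os').system('echo pwned')")
    # while the registry lookup can only return what was registered:
    #   lookup_registry('hann')
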
@@ -348,8 +386,8 @@ def get_window( ) try: - winfunc = eval('_' + winstr) - except NameError as e: + winfunc = window_function_register.get('_' + winstr) + except KeyError as e: raise ValueError("Unknown window type.") from e params = (win_length,) + args From 315ef26505c4c1b5bf534b5ff9c8643fcce57f54 Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Fri, 28 Oct 2022 20:08:03 +0800 Subject: [PATCH 22/91] [AutoParallel] fix engine _build and cost method (#47263) * fix engine build method * fix import * update engine cost * update raise error * update cmakelist * revert optimizer * revert optimizer * fix unittest * fix unittest Co-authored-by: caozhou --- .../auto_parallel/cost/comp_op_cost.py | 38 ++ .../auto_parallel/cost/estimate_cost.py | 28 +- .../distributed/auto_parallel/engine.py | 338 +++++++++--------- .../paddle/distributed/auto_parallel/utils.py | 31 ++ .../unittests/auto_parallel/CMakeLists.txt | 22 +- .../unittests/auto_parallel/engine_api.py | 54 ++- .../auto_parallel/test_engine_api_error.py | 311 ++++++++++++++++ 7 files changed, 634 insertions(+), 188 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api_error.py diff --git a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py index 51c3d6069a691..293c9ae3a58a1 100644 --- a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py @@ -34,6 +34,25 @@ def calc_time(self): return 0 +@register_op_cost +class ArgsortOpCost(CompOpCost): + OP_TYPE = "argsort" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ArgsortOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + @register_op_cost class AssignOpCost(CompOpCost): OP_TYPE = "assign" @@ -338,6 +357,25 @@ def calc_time(self): return 0 +@register_op_cost +class EqualOpCost(CompOpCost): + OP_TYPE = "equal" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(EqualOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + @register_op_cost class EmbeddingOpCost(CompOpCost): OP_TYPE = "c_embedding" diff --git a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py index cac8bf9f277e4..5808b706fce03 100644 --- a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py @@ -545,11 +545,12 @@ def pretty_print_cost(self): def get_cost_from_engine(engine, mode): from ..utils import to_list + import copy # Construct cost estimator by original main program serial_main_prog = ( - engine._serial_main_progs[mode].clone() - if mode in engine._serial_main_progs + engine._fwd_main_progs[mode].clone() + if mode in engine._fwd_main_progs else engine._orig_main_prog.clone() ) @@ -566,29 +567,29 @@ def 
get_cost_from_engine(engine, mode): ) else engine._losses ) - - if mode in engine._dist_contexts: - dist_context = engine._dist_contexts[mode] - completer = engine._planners[mode].completer + serial_optimizer = copy.deepcopy(engine._orig_optimizer) + if mode in engine._fwd_dist_contexts: + dist_context = copy.deepcopy(engine._fwd_dist_contexts[mode]) else: - from ..completion import Completer from ..dist_context import DistributedContext dist_context = DistributedContext( serial_main_prog, serial_startup_prog, - engine._optimizer, + serial_optimizer, losses, {}, {"loss": losses}, engine._cluster, engine._strategy, ) - completer = Completer(dist_context) - completer.complete_forward_annotation() - dist_context.block_state.parse_forward_blocks( - dist_context.serial_main_program - ) + from ..completion import Completer + + completer = Completer(dist_context) + completer.complete_forward_annotation() + dist_context.block_state.parse_forward_blocks( + dist_context.serial_main_program + ) if mode == "eval" or mode == "predict": cost_estimator = CostEstimator(serial_main_prog, engine._cluster) @@ -596,7 +597,6 @@ def get_cost_from_engine(engine, mode): from ..parallelizer_v2 import Parallelizer # Get serial main program with backward - serial_optimizer = engine._optimizer parallelizer = Parallelizer(mode, completer, dist_context) # Generate backward loss_name = dist_context.serial_loss.name diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 1581beb2a511c..28e9fc69d7d59 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -13,8 +13,10 @@ # limitations under the License. import os +import copy import logging import random +import numbers import numpy as np from collections import defaultdict @@ -45,15 +47,16 @@ DistributedDataLoaderFromGenerator, DistributedDataLoader, ) -from .utils import to_list, get_dist_attr, get_lr from .process_group import new_process_group, get_all_process_groups from .dist_context import DistributedContext, get_default_distributed_context from .strategy import Strategy from .interface import CollectionNames, get_collection -from ..utils.log_utils import get_logger -from .utils import initialize_pg_in_full_mode +from .utils import to_list, get_dist_attr, get_lr, validate_opt +from .utils import initialize_pg_in_full_mode, get_input_split_info from .cost.estimate_cost import get_cost_from_engine +from ..utils.log_utils import get_logger + class Engine: """ @@ -137,6 +140,15 @@ def __init__( "'model must be sub classes of `paddle.nn.Layer` or any callable function." ) self._model = model + + if ( + loss + and not isinstance(loss, (paddle.nn.Layer, Variable)) + and not callable(loss) + ): + raise TypeError( + "'loss' must be sub classes of `paddle.nn.Layer` or any callable function or a Variable." + ) self._loss = loss if optimizer and not isinstance( @@ -147,13 +159,17 @@ def __init__( "'optimizer' must be object of class `paddle.optimizer.Optimizer`" " or `paddle.fluid.optimizer.Optimizer`." 
) - self._optimizer = self._validate_opt(optimizer) + self._optimizer = validate_opt(optimizer) + self._orig_optimizer = copy.deepcopy(self._optimizer) metrics = metrics or [] for metric in to_list(metrics): - assert isinstance( - metric, Metric - ), "{} is not sub class of Metric".format(metric.__class__.__name__) + if metric and not isinstance(metric, Metric): + raise TypeError( + "{} is not sub class of Metric".format( + metric.__class__.__name__ + ) + ) self._metrics = to_list(metrics) if cluster and not isinstance(cluster, Cluster): @@ -168,9 +184,10 @@ def __init__( ) self._strategy = strategy or Strategy() + self._logger = get_logger(logging.INFO) if os.getenv("POD_NAME"): - print( - "Distribute training by paddle.distributed.launch", flush=True + self._logger.info( + "Distribute training by paddle.distributed.launch" ) fleet.init(is_collective=True) @@ -179,12 +196,12 @@ def __init__( self._nranks = paddle.distributed.get_world_size() self._saver = DistributedSaver() - self._logger = get_logger(logging.INFO) - self._orig_main_prog = static.default_main_program() self._orig_startup_prog = static.default_startup_program() self._orig_dist_context = get_default_distributed_context() self._dist_contexts = {} + self._fwd_main_progs = {} + self._fwd_dist_contexts = {} self._serial_main_progs = {} self._serial_startup_progs = {} self._dist_main_progs = defaultdict(dict) # dist main programs @@ -202,13 +219,14 @@ def __init__( self._labels_spec = [] self._inputs = [] self._labels = [] + self._losses = [] + self._mode = None self._skip_build = False self._outside_dataloader = False self._planned_mode = None self._dygraph_mode = False self._tuning = self._strategy.tuning - self._losses = None self.history = None @@ -230,7 +248,7 @@ def _prepare_data_spec(self, data, split, batch_size): inputs = sample[:split] labels = sample[split:] else: - raise ValueError( + raise TypeError( "Data should be a Dataset or IterableDatset, but received {}.".format( type(data).__name__ ) @@ -259,8 +277,14 @@ def _infer_item_spec(item, name, batch_size, specs): specs.append(spec) else: specs.append(spec.batch(batch_size)) - else: + elif isinstance(item, numbers.Number): specs.append(InputSpec([batch_size], type(item), name)) + else: + raise TypeError( + "The sample's dtype returned of dataset should be number, np.ndarray or Tensor, but got {}".format( + type(item).__name__ + ) + ) if inputs is not None: for i, item in enumerate(inputs): @@ -277,43 +301,41 @@ def _infer_item_spec(item, name, batch_size, specs): labels_spec = self._validate_spec(labels_spec) return inputs_spec, labels_spec - def _prepare_data_tensor( - self, inputs_spec, labels_spec, inputs=None, labels=None - ): + def _prepare_data_tensor(self, inputs_spec, labels_spec, inputs, labels): if _non_static_mode() or self._dygraph_mode: - return None, None - inputs_spec = inputs_spec if inputs_spec else [] - labels_spec = labels_spec if labels_spec else [] + raise ValueError("Only support static graph mode.") + if inputs_spec: assert isinstance( inputs_spec, list ), "inputs should be list, but received {}".format( type(inputs_spec) ) - if inputs is None: - inputs = [s._create_feed_layer() for s in inputs_spec] - else: - assert isinstance( - inputs, list - ), "inputs should be list, but received {}".format(type(inputs)) - for input_spec, input in zip(inputs_spec, inputs): - if input_spec.shape != input.shape: - input.desc.set_shape(input_spec.shape) + assert isinstance( + inputs, list + ), "inputs should be list, but received {}".format(type(inputs)) + 
assert len(inputs_spec) == len( + inputs + ), "the number of `inputs_spec` should be equal to `inputs`'s." + for input_spec, input in zip(inputs_spec, inputs): + if input_spec.shape != input.shape: + input.desc.set_shape(input_spec.shape) if labels_spec: assert isinstance( labels_spec, list ), "labels should be list, but received {}".format( type(labels_spec) ) - if labels is None: - labels = [s._create_feed_layer() for s in labels_spec] - else: - assert isinstance( - labels, list - ), "labels should be list, but received {}".format(type(labels)) - for label_spec, label in zip(labels_spec, labels): - if label_spec.shape != label.shape: - label.desc.set_shape(label_spec.shape) + assert isinstance( + labels, list + ), "labels should be list, but received {}".format(type(labels)) + assert len(labels_spec) == len( + labels + ), "the number of `labels_spec` should be equal to `labels`'s." + for label_spec, label in zip(labels_spec, labels): + if label_spec.shape != label.shape: + label.desc.set_shape(label_spec.shape) + return inputs, labels def _prepare_reader(self): @@ -497,10 +519,12 @@ def _build(self, mode): self._dygraph_mode = True self._logger.info("Building model with 'to_static' method.") - inputs_spec = self._inputs_spec - labels_spec = self._labels_spec if self._labels_spec else [] self.program_helper = ProgramHelper( - self._model, self._loss, self._metrics, inputs_spec, labels_spec + self._model, + self._loss, + self._metrics, + self._inputs_spec, + self._labels_spec, ) # build forward main program self.program_helper.build_program(mode) @@ -509,16 +533,12 @@ def _build(self, mode): serial_main_prog = self.program_helper.main_program serial_startup_prog = self.program_helper.startup_program - inputs = self.program_helper.input_vars + self._inputs = self.program_helper.input_vars + self._labels = self.program_helper.label_vars outputs = self.program_helper.output_vars - labels = self.program_helper.label_vars - losses = self.program_helper.loss_vars - self._losses = losses + self._losses = self.program_helper.loss_vars metrics = self.program_helper.metric_vars - self._inputs = inputs - self._labels = labels - paddle.enable_static() else: # build program in static mode @@ -527,29 +547,45 @@ def _build(self, mode): return outputs = [] - losses = [] metrics = [] - inputs = self._inputs if self._inputs else [] - labels = self._labels if self._labels else [] + self._losses = [] serial_main_prog = self._orig_main_prog.clone() serial_startup_prog = self._orig_startup_prog.clone() if not self._skip_build: with static.program_guard( serial_main_prog, serial_startup_prog ), utils.unique_name.guard(): - outputs = to_list(self._model(*inputs)) + self._inputs = [ + s._create_feed_layer() for s in self._inputs_spec + ] + self._labels = [ + s._create_feed_layer() for s in self._labels_spec + ] + + outputs = to_list(self._model(*self._inputs)) + if mode != "predict" and self._loss: - losses = to_list(self._loss(*(outputs + labels))) - self._losses = losses + assert isinstance( + self._loss, paddle.nn.Layer + ) or callable( + self._loss + ), "the type of `loss` of the Engine arguments should be sub classes of `paddle.nn.Layer` or any callable function." 
+ self._losses = to_list( + self._loss(*(outputs + self._labels)) + ) - if mode != "predict" and (outputs or labels): + if mode != "predict" and (outputs or self._labels): for metric in self._metrics: metrics.append( - to_list(metric.compute(*(outputs + labels))) + to_list( + metric.compute(*(outputs + self._labels)) + ) ) else: - losses = to_list(self._loss) - self.losses = losses + assert isinstance( + self._loss, Variable + ), "the type of `loss` of the Engine arguments should be Variable." + self._losses = to_list(self._loss) default_ctx = get_default_distributed_context() if not default_ctx.has_annotation: @@ -558,11 +594,11 @@ def _build(self, mode): new_process_group(list(range(self._nranks))) default_ctx.data_parallel = True - feed_vars = {"inputs": inputs, "labels": labels} + feed_vars = {"inputs": self._inputs, "labels": self._labels} fetch_vars = { "outputs": flatten(outputs), - "loss": losses, + "loss": self._losses, "metrics": metrics, } @@ -574,13 +610,24 @@ def _build(self, mode): serial_main_prog, serial_startup_prog, self._optimizer, - losses, + self._losses, + feed_vars, + fetch_vars, + self._cluster, + self._strategy, + ) + self._fwd_dist_contexts[mode] = DistributedContext( + serial_main_prog, + serial_startup_prog, + self._optimizer, + self._losses, feed_vars, fetch_vars, self._cluster, self._strategy, ) self._dist_contexts[mode].gradient_scale = self._strategy.gradient_scale + self._fwd_main_progs[mode] = serial_main_prog.clone() def _optimization_tuning(self, mode, dataset, batch_size): if not self._tuning.enable: @@ -637,8 +684,8 @@ def _plan(self, mode): self._dp_world_sizes = [] self._dp_ranks = [] for feed_var in feed_list: - dp_world_size, dp_rank = self._get_input_split_info( - feed_var, self._dist_contexts[mode] + dp_world_size, dp_rank = get_input_split_info( + self._cur_rank, feed_var, self._dist_contexts[mode] ) self._dp_world_sizes.append(dp_world_size) self._dp_ranks.append(dp_rank) @@ -834,18 +881,11 @@ def fit( self._inputs_spec, self._labels_spec = self._prepare_data_spec( train_data, train_sample_split, batch_size ) - self._inputs, self._labels = self._prepare_data_tensor( - self._inputs_spec, self._labels_spec - ) if not self._has_prepared[self._mode]: self._prepare_program(self._mode) else: self._switch_mode(self._mode) - assert ( - self._mode in self._dist_main_progs - ), "train model is not ready, please call `engine._prepare_program('train')` first." - train_dataloader = self._prepare_dataloader_from_generator( dataset=train_data, capacity=70, @@ -984,17 +1024,11 @@ def evaluate( self._inputs_spec, self._labels_spec = self._prepare_data_spec( valid_data, valid_sample_split, batch_size ) - self._inputs, self._labels = self._prepare_data_tensor( - self._inputs_spec, self._labels_spec - ) if not self._has_prepared[self._mode]: self._prepare_program(self._mode) else: self._switch_mode(self._mode) - assert ( - self._mode in self._dist_main_progs - ), "eval model is not ready, please call `engine._prepare_program('eval')` first." 
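
With the readiness assert above removed, evaluate() (like fit() and predict()) now builds the 'eval' program lazily on first call. A usage sketch, with the model and dataset as placeholders:

    import paddle
    from paddle.distributed.fleet import auto

    engine = auto.Engine(model=mlp,                    # placeholder model
                         loss=paddle.nn.CrossEntropyLoss(),
                         metrics=paddle.metric.Accuracy())
    engine.evaluate(valid_dataset, batch_size=8)       # prepares 'eval' on demand
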
valid_dataloader = self._prepare_dataloader_from_generator( dataset=valid_data, capacity=70, @@ -1096,18 +1130,11 @@ def predict( self._inputs_spec, self._labels_spec = self._prepare_data_spec( test_data, test_sample_split, batch_size ) - self._inputs, self._labels = self._prepare_data_tensor( - self._inputs_spec, self._labels_spec - ) if not self._has_prepared[self._mode]: self._prepare_program(self._mode) else: self._switch_mode(self._mode) - assert ( - self._mode in self._dist_main_progs - ), "predict model is not ready, please call `engine._prepare_program('predict')` first." - test_dataloader = self._prepare_dataloader_from_generator( dataset=test_data, capacity=70, @@ -1165,13 +1192,11 @@ def dataloader( self._inputs_spec, self._labels_spec = self._prepare_data_spec( dataset, sample_split, batch_size ) - self._inputs, self._labels = self._prepare_data_tensor( - self._inputs_spec, self._labels_spec - ) if not self._has_prepared[self._mode]: self._prepare_program(self._mode) else: self._switch_mode(self._mode) + dataloader = self._prepare_dataloader( dataset, return_list=False, @@ -1209,13 +1234,11 @@ def dataloader_from_generator( self._inputs_spec, self._labels_spec = self._prepare_data_spec( dataset, sample_split, batch_size ) - self._inputs, self._labels = self._prepare_data_tensor( - self._inputs_spec, self._labels_spec - ) if not self._has_prepared[self._mode]: self._prepare_program(self._mode) else: self._switch_mode(self._mode) + dataloader = self._prepare_dataloader_from_generator( dataset=dataset, capacity=capacity, @@ -1243,45 +1266,49 @@ def prepare( ): if mode is not None: self.to_mode(mode) + + if not self._mode: + raise ValueError( + "Please set mode to be prepared with `prepare(mode=...)`" + ) + + if self._has_prepared[self._mode]: + return + + inputs_spec = self._validate_spec(inputs_spec) + labels_spec = self._validate_spec(labels_spec) + inputs = self._validate_vars(inputs) + labels = self._validate_vars(labels) + + self._orig_main_prog = main_program + self._orig_startup_prog = startup_program if inputs or labels: self._skip_build = True - self._inputs_spec = inputs_spec - self._labels_spec = labels_spec - self._inputs, self._labels = self._prepare_data_tensor( - self._inputs_spec, self._labels_spec, inputs, labels + inputs, labels = self._prepare_data_tensor( + inputs_spec, labels_spec, inputs, labels ) - self._orig_main_prog = main_program if self._orig_main_prog is None: self._orig_main_prog = static.default_main_program() - self._orig_startup_prog = startup_program if self._orig_startup_prog is None: self._orig_startup_prog = static.default_startup_program() - if not self._has_prepared[self._mode]: - self._prepare_program(self._mode) - else: - self._switch_mode(self._mode) elif inputs_spec or labels_spec: - self._inputs_spec = inputs_spec - self._labels_spec = labels_spec self._outside_dataloader = True - self._inputs, self._labels = self._prepare_data_tensor( - self._inputs_spec, self._labels_spec - ) - self._orig_main_prog = main_program if self._orig_main_prog is None: self._orig_main_prog = static.default_main_program() - self._orig_startup_prog = startup_program if self._orig_startup_prog is None: self._orig_startup_prog = static.default_startup_program() - if not self._has_prepared[self._mode]: - self._prepare_program(self._mode) - else: - self._switch_mode(self._mode) else: assert ( self._inputs_spec and self._labels_spec ), "Please call the dataloader(...) 
before calling prepare(...)" + self._inputs_spec, self._labels_spec = inputs_spec, labels_spec + self._inputs, self._labels = inputs, labels + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) + else: + self._switch_mode(self._mode) + def run(self, data=None, feed=None, fetch_list=None, mode=None): if mode is not None: self.to_mode(mode) @@ -1331,7 +1358,6 @@ def _prepare_dataloader( dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] dist_startup_prog = self._dist_startup_progs[self._mode][self._cur_rank] - dist_context = self._dist_contexts[self._mode] dist_main_block = dist_main_prog.global_block() # NOTE: Get feed_list, then insert dataloader op with sharded var shape. @@ -1400,7 +1426,6 @@ def _prepare_dataloader_from_generator( dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] dist_startup_prog = self._dist_startup_progs[self._mode][self._cur_rank] - dist_context = self._dist_contexts[self._mode] dist_main_block = dist_main_prog.global_block() # NOTE: Get feed_list, then insert dataloader op with sharded var shape. @@ -1446,9 +1471,6 @@ def _tune(self, tune_data, tune_sample_split=None, batch_size=1): self._inputs_spec, self._labels_spec = self._prepare_data_spec( tune_data, tune_sample_split, batch_size ) - self._inputs, self._labels = self._prepare_data_tensor( - self._inputs_spec, self._labels_spec - ) self._optimization_tuning(self._mode, tune_data, batch_size) def _validate_spec(self, specs): @@ -1456,7 +1478,10 @@ def _validate_spec(self, specs): self._k_steps = self._strategy.gradient_merge.k_steps if specs is not None: for i, spec in enumerate(specs): - assert isinstance(spec, InputSpec) + if not isinstance(spec, InputSpec): + raise TypeError( + "'spec' must be object of class `paddle.static.InputSpec`." 
+ ) if spec.name is None: raise ValueError( "Requires Input[{}].name != None, but receive `None` with {}.".format( @@ -1472,39 +1497,20 @@ def _validate_spec(self, specs): ) shape[0] //= self._k_steps spec.shape = shape - return specs + return specs or [] + + def _validate_vars(self, vars): + vars = to_list(vars) + if vars is not None: + for i, var in enumerate(vars): + if not isinstance(var, Variable): + raise TypeError("'var' must be a `Variable`.") + return vars or [] def _is_local_var(self, var): var_name = _to_name_str(var) return var_name in self.main_program.global_block().vars - def _get_input_split_info(self, var, dist_context): - # deduce how the input data is split among the cluster - from .utils import _get_comm_group, _get_corresponding_rank - - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program(var) - process_mesh = tensor_dist_attr.process_mesh - dims_mapping = tensor_dist_attr.dims_mapping - - if self._cur_rank not in process_mesh.processes: - rank_id = _get_corresponding_rank( - dist_context, process_mesh, self._cur_rank - ) - else: - rank_id = self._cur_rank - - batch_size_axis = dims_mapping[0] - if batch_size_axis > -1 and process_mesh.topology[batch_size_axis] > 1: - group_ranks = _get_comm_group( - process_mesh.processes, - process_mesh.topology, - batch_size_axis, - rank_id, - ) - return len(group_ranks), group_ranks.index(rank_id) - - return 1, 0 - def _set_recompute_ckpts(self): # NOTE hack to enable recompute in engine api for GPT-3 # TODO support more PaddleNLP/CV models here @@ -1534,12 +1540,6 @@ def _set_recompute_ckpts(self): } self._logger.info(logs) - def _validate_opt(self, optimizer): - if optimizer is not None: - optimizer._parameter_list = None - optimizer._param_groups = None - return optimizer - def _reset_metrics(self): for metric in self._metrics: metric.reset() @@ -1551,6 +1551,9 @@ def _metrics_name(self): return metrics_name def _switch_mode(self, mode): + assert ( + mode in self._dist_main_progs + ), "{} model is not ready, please call `prepare()` first.".format(mode) self.to_mode(mode) self._optimizer = self._dist_contexts[mode]._serial_optimizer @@ -1691,7 +1694,7 @@ def load(self, path, strict=True, load_optimizer=True): ) return self._state_dict, self._dist_attr - def cost(self, inputs_spec=None, labels_spec=None, mode="train"): + def cost(self, inputs_spec=None, labels_spec=None, mode=None): """ Get and Print cost, including memory of every rank, max memory among all ranks, and the global cost of one step based on @@ -1702,7 +1705,7 @@ def cost(self, inputs_spec=None, labels_spec=None, mode="train"): Args: inputs_spec(InputSpec): The specification of inputs. Default: None. labels_spec(InputSpec): The specification of labels. Default: None. - mode (str): The engine mode must be in ["train", "predict", "eval"]. Default: "train". + mode (str): The engine mode must be in ["train", "predict", "eval"]. Default: None. Returns: Return the global execution time (ms) and max memory (B). @@ -1710,33 +1713,44 @@ def cost(self, inputs_spec=None, labels_spec=None, mode="train"): """ # Check parallel mode if self._strategy.auto_mode == "full": - print( + self._logger.info( "The cost will be calcudated in the search process when the auto mode is full." ) return # Check mode - accepted_modes = ["train", "predict", "eval"] - if mode not in accepted_modes: + mode = mode if mode is not None else self._mode + assert mode is not None, "Please set mode." 
+ if mode not in self._has_prepared: raise ValueError( "The mode {} is not in accepted modes {}".format( - mode, accepted_modes + mode, list(self._has_prepared.keys()) ) ) self.to_mode(mode) - if inputs_spec is not None: - self._inputs_spec, self._labels_spec = inputs_spec, labels_spec - self._inputs, self._labels = self._prepare_data_tensor( - self._inputs_spec, self._labels_spec - ) + if inputs_spec is not None and not self._has_prepared[mode]: + self._inputs_spec = self._validate_spec(inputs_spec) + self._labels_spec = self._validate_spec(labels_spec) self._build(mode) self._plan(mode) else: if _non_static_mode() or self._dygraph_mode: raise ValueError( - "Please call `engine._prepare_program('mode')` firstly when in the static graph mode." + "Please call `prepare()` or `fit()` or `evaluate()` or `predict()` before calling `cost()`." + ) + else: + self._logger.info( + "The program whose cost to be estimated must be static default program. Otherwise, please call `prepare()`before calling `cost()`." ) + program = paddle.static.default_main_program() + if ( + not program.global_block().ops + or not program.global_block().ops + ) and not self._has_prepared[mode]: + raise ValueError( + "Please call `prepare()` or `fit()` or `evaluate()` or `predict()` before calling `cost()`." + ) # Estimate the exec cost and max memory global_cost, max_memory = get_cost_from_engine(self, mode) diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 8b7ec647f6e76..73d1c1412d1e3 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -1876,3 +1876,34 @@ def initialize_pg_in_full_mode(all_process_groups, cur_rank): break process_group.instantiate() server_socket.close() + + +def get_input_split_info(cur_rank, var, dist_context): + # deduce how the input data is split among the cluster + tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program(var) + process_mesh = tensor_dist_attr.process_mesh + dims_mapping = tensor_dist_attr.dims_mapping + + if cur_rank not in process_mesh.processes: + rank_id = _get_corresponding_rank(dist_context, process_mesh, cur_rank) + else: + rank_id = cur_rank + + batch_size_axis = dims_mapping[0] + if batch_size_axis > -1 and process_mesh.topology[batch_size_axis] > 1: + group_ranks = _get_comm_group( + process_mesh.processes, + process_mesh.topology, + batch_size_axis, + rank_id, + ) + return len(group_ranks), group_ranks.index(rank_id) + + return 1, 0 + + +def validate_opt(optimizer): + if optimizer is not None: + optimizer._parameter_list = None + optimizer._param_groups = None + return optimizer diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 3d34ed4fcdbe7..bd6ccfd3922c8 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -63,6 +63,15 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_engine_callbacks MODULES test_engine_callbacks) set_tests_properties(test_engine_callbacks PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_parallel_tuner MODULES test_parallel_tuner ENVS + ${dist_ENVS}) + set_tests_properties(test_parallel_tuner PROPERTIES TIMEOUT 120) + py_test_modules(test_parallel_tuner_full MODULES test_parallel_tuner_full + ENVS ${dist_ENVS}) + set_tests_properties(test_parallel_tuner_full PROPERTIES TIMEOUT 120) 
+ py_test_modules(test_parallel_tuner_predict MODULES + test_parallel_tuner_predict ENVS ${dist_ENVS}) + set_tests_properties(test_parallel_tuner_predict PROPERTIES TIMEOUT 120) py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) @@ -90,6 +99,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_prim_dist_op MODULES test_prim_dist_op ENVS ${dist_ENVS}) py_test_modules(test_to_static MODULES test_to_static ENVS ${dist_ENVS}) py_test_modules(test_dist_op_cost MODULES test_dist_op_cost ENVS ${dist_ENVS}) + py_test_modules(test_cluster_v2 MODULES test_cluster_v2) py_test_modules(test_process_mesh_v2 MODULES test_process_mesh_v2) py_test_modules(test_dist_attr_v2 MODULES test_dist_attr_v2) @@ -99,20 +109,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_interface MODULES test_interface) py_test_modules(test_strategy MODULES test_strategy) py_test_modules(test_pass_quantization MODULES test_pass_quantization) - py_test_modules(test_dist_shape MODULES test_dist_shape) py_test_modules(test_dist_assign MODULES test_dist_assign) py_test_modules(test_conditional_block_reshard MODULES test_conditional_block_reshard) - - py_test_modules(test_parallel_tuner MODULES test_parallel_tuner ENVS - ${dist_ENVS}) - set_tests_properties(test_parallel_tuner PROPERTIES TIMEOUT 120) - py_test_modules(test_parallel_tuner_full MODULES test_parallel_tuner_full - ENVS ${dist_ENVS}) - set_tests_properties(test_parallel_tuner_full PROPERTIES TIMEOUT 120) - py_test_modules(test_parallel_tuner_predict MODULES - test_parallel_tuner_predict ENVS ${dist_ENVS}) - set_tests_properties(test_parallel_tuner_predict PROPERTIES TIMEOUT 120) + py_test_modules(test_engine_api_error MODULES test_engine_api_error) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py index c09edf0442566..291792e359b49 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py @@ -374,6 +374,57 @@ def train_non_builtin_data_vars(): def get_cost(): + main_program = static.Program() + startup_program = static.Program() + with static.program_guard( + main_program, startup_program + ), utils.unique_name.guard(): + input = static.data( + name="input", shape=[batch_size, image_size], dtype='float32' + ) + label = static.data(name="label", shape=[batch_size, 1], dtype='int64') + + loader = paddle.io.DataLoader.from_generator( + feed_list=[input, label], capacity=4 * batch_size, iterable=False + ) + places = static.cuda_places() + loader.set_batch_generator(batch_generator_creator(), places=places) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02, + ) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None, + ) + metric = paddle.metric.Accuracy() + predict = mlp(input) + loss_var = loss(predict, label) + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine( + loss=loss_var, optimizer=optimizer, metrics=metric, strategy=strategy + ) + engine.prepare( + main_program=main_program, + startup_program=startup_program, + inputs=[input], + labels=[label], + mode="train", + ) + engine.cost() + + +def get_cost_by_default_program(): main_program = static.default_main_program() startup_program = 
static.default_startup_program() with static.program_guard( @@ -414,7 +465,7 @@ def get_cost(): engine = auto.Engine( loss=loss_var, optimizer=optimizer, metrics=metric, strategy=strategy ) - engine.cost() + engine.cost(mode="train") def get_cost_by_spec(): @@ -451,4 +502,5 @@ def get_cost_by_spec(): train_builtin_data_vars() train_non_builtin_data_vars() get_cost() + get_cost_by_default_program() get_cost_by_spec() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api_error.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api_error.py new file mode 100644 index 0000000000000..cd825524b8ae3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api_error.py @@ -0,0 +1,311 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +import paddle.static as static +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.io import Dataset + +from paddle.distributed.fleet import auto + +paddle.enable_static() + + +epoch_num = 1 +batch_size = 2 +batch_num = 10 +hidden_size = 1024 +sequence_len = 512 +image_size = hidden_size +class_num = 10 + +is_fetch = True +is_feed = True +my_feed_vars = [] + + +class TrainDataset(Dataset): + def __init__(self, num_samples): + super(TrainDataset, self).__init__() + self.num_samples = num_samples + + def __getitem__(self, index): + input = np.random.uniform(size=image_size).astype("float32") + label = np.random.randint(0, class_num - 1, dtype="int64") + return input, label + + def __len__(self): + return self.num_samples + + +class TestDataset(Dataset): + def __init__(self, num_samples): + super(TestDataset, self).__init__() + self.num_samples = num_samples + + def __getitem__(self, index): + input = np.random.uniform(size=image_size).astype("float32") + return input + + def __len__(self): + return self.num_samples + + +class MLPLayer(nn.Layer): + def __init__( + self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02, + ): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=initializer_range) + ) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr + ) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr + ) + self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") + + def forward(self, input): + out = self.norm(input) + out = self.linear0(out) + + if is_feed: + my_feed_vars.append((out, out.shape)) + + out = F.gelu(out, approximate=True) + out = self.linear1(out) + out = self.dropout(out) + out = self.linear2(out) + + if is_feed: + my_feed_vars.append((out, out.shape)) + if 
is_fetch: + auto.fetch(out, "my_fetch", logging=True) + return out + + +class TestEngineErrorRaise(unittest.TestCase): + def setUp(self): + class NoSupportData1: + def __getitem__(self, index): + input = np.random.uniform(size=image_size).astype("float32") + label = np.random.randint(0, class_num - 1, dtype="int64") + return input, label + + class NoSupportData2(TrainDataset): + def __getitem__(self, index): + input = [ + list(np.random.uniform(size=image_size).astype("float32")) + ] + label = [np.random.randint(0, class_num - 1, dtype="int64")] + return input, label + + class NoSupportData3: + def __getitem__(self, index): + input = np.random.uniform(size=image_size).astype("float32") + return input + + class NoSupportData4(TestDataset): + def __getitem__(self, index): + input = [ + list(np.random.uniform(size=image_size).astype("float32")) + ] + return input + + self.no_support_data_1 = NoSupportData1() + self.no_support_data_2 = NoSupportData2(10) + self.no_support_data_3 = NoSupportData3() + self.no_support_data_4 = NoSupportData4(10) + + def test_Engine(self): + with self.assertRaises(TypeError): + auto.Engine(model=paddle.static.Program()) + with self.assertRaises(TypeError): + auto.Engine(loss="CrossEntropyLoss") + with self.assertRaises(TypeError): + auto.Engine(optimizer="adam") + with self.assertRaises(TypeError): + auto.Engine(metrics=["acc"]) + with self.assertRaises(TypeError): + auto.Engine(cluster="cluster") + with self.assertRaises(TypeError): + auto.Engine(strategy="strategy") + + def test_fit(self): + + with self.assertRaises(TypeError): + + engine = auto.Engine( + model=MLPLayer(), + loss=paddle.nn.CrossEntropyLoss(), + optimizer=paddle.optimizer.AdamW(0.00001), + ) + engine.fit(train_data=self.no_support_data_1) + + with self.assertRaises(TypeError): + + engine = auto.Engine( + model=MLPLayer(), + loss=paddle.nn.CrossEntropyLoss(), + optimizer=paddle.optimizer.AdamW(0.00001), + ) + engine.fit(train_data=self.no_support_data_2) + + def test_evaluate(self): + with self.assertRaises(TypeError): + + engine = auto.Engine( + model=MLPLayer(), + loss=paddle.nn.CrossEntropyLoss(), + metrics=paddle.metric.Accuracy(), + ) + engine.evaluate(valid_data=self.no_support_data_3) + + with self.assertRaises(TypeError): + + engine = auto.Engine( + model=MLPLayer(), + loss=paddle.nn.CrossEntropyLoss(), + metrics=paddle.metric.Accuracy(), + ) + engine.evaluate( + valid_data=self.no_support_data_4, valid_sample_split=1 + ) + + def test_predict(self): + with self.assertRaises(TypeError): + + engine = auto.Engine(model=MLPLayer()) + engine.predict( + test_data=self.no_support_data_3, test_sample_split=1 + ) + + with self.assertRaises(TypeError): + + engine = auto.Engine(model=MLPLayer()) + engine.predict( + test_data=self.no_support_data_4, test_sample_split=1 + ) + + def build_program(self): + main_prog = static.Program() + startup_prog = static.Program() + with static.program_guard(main_prog, startup_prog): + input = static.data( + name="input", + shape=[batch_size // 2, image_size], + dtype='float32', + ) + label = static.data( + name="label", shape=[batch_size // 2, 1], dtype='int64' + ) + mlp = MLPLayer() + loss = paddle.nn.CrossEntropyLoss() + predict = mlp(input) + loss_var = loss(predict, label) + return main_prog, startup_prog, input, label, loss_var + + def test_prepare(self): + with self.assertRaises(ValueError): + engine = auto.Engine(model=MLPLayer()) + engine.prepare() + + with self.assertRaises(AssertionError): + engine = auto.Engine(model=MLPLayer()) + 
engine.prepare(mode="train") + + with self.assertRaises(TypeError): + input = static.data( + name="input", + shape=[batch_size / 2, image_size], + dtype='float32', + ) + label = static.data( + name="label", shape=[batch_size / 2, 1], dtype='int64' + ) + engine = auto.Engine(model=MLPLayer()) + engine.prepare(inputs_spec=input, labels_spec=label, mode="eval") + + input_spec = static.InputSpec( + shape=[batch_size, image_size], dtype="float32", name="input" + ) + label_spec = static.InputSpec( + shape=[batch_size, image_size], dtype="float32", name="input" + ) + ( + main_prog, + startup_prog, + input_var, + label_var, + loss_var, + ) = self.build_program() + + with self.assertRaises(TypeError): + engine = auto.Engine(loss=loss_var) + engine.prepare( + inputs=input_spec, + labels=label_spec, + main_program=main_prog, + startup_program=startup_prog, + mode="eval", + ) + + with self.assertRaises(AssertionError): + engine = auto.Engine(loss=loss_var) + engine.prepare( + inputs_spec=[input_spec, input_spec], + labels_spec=[label_spec, label_spec], + inputs=input_var, + labels=label_var, + main_program=main_prog, + startup_program=startup_prog, + mode="predict", + ) + + def test_cost(self): + with self.assertRaises(ValueError): + engine = auto.Engine(model=MLPLayer()) + engine.cost(mode="predict") + + +class TestEngineDynamicErrorRaise(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + def tearDown(self): + paddle.enable_static() + + def test_cost(self): + with self.assertRaises(ValueError): + engine = auto.Engine(model=MLPLayer()) + engine.cost(mode="predict") + + +if __name__ == "__main__": + unittest.main() From e77c062ef1d2503fb1dd4139821488c5e7441701 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Fri, 28 Oct 2022 21:19:12 +0800 Subject: [PATCH 23/91] [Dygraph] Finish fixing mem bugs of no sync in DataParallel (#47444) --- .../fluid/distributed/collective/reducer.cc | 43 ++++++++++++------- paddle/fluid/distributed/collective/reducer.h | 3 +- paddle/fluid/imperative/reducer.cc | 9 ++-- paddle/fluid/imperative/reducer.h | 3 +- paddle/fluid/pybind/distributed_py.cc | 7 +-- paddle/fluid/pybind/imperative.cc | 1 - python/paddle/fluid/dygraph/parallel.py | 8 +++- 7 files changed, 41 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 2c26828e5e114..f04585ce1710f 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -588,10 +588,9 @@ void EagerReducer::TraverseBackwardGraph(const std::vector &outputs) { } } -void EagerReducer::PrepareForBackward(const std::vector &outputs, - const bool is_sync) { +void EagerReducer::PrepareForBackward(const std::vector &outputs) { VLOG(3) << "after forward, then reset count for backward."; - grad_need_hooks_ = is_sync; + grad_need_hooks_ = true; next_group_ = 0; std::for_each(groups_.begin(), groups_.end(), [](EagerGroup &group) { @@ -660,9 +659,25 @@ void EagerReducer::AddDistHook(size_t var_index) { var_index)); // gradient synchronization is not required when grad_need_hooks_ is false. 
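
The hunk below replaces the early return sketched in the comment above: when hooks are disabled, the hook now shares each local grad into the group buffer instead of bailing out, so gradients produced under no_sync() stay usable. A minimal check of that behavior, complementing the accumulation loop sketched earlier (model and data are placeholders):

    import paddle
    import paddle.distributed as dist
    from paddle import nn

    dist.init_parallel_env()
    linear = nn.Linear(8, 8)                   # placeholder model
    model = paddle.DataParallel(linear)
    x = paddle.rand([4, 8])                    # placeholder batch

    with model.no_sync():
        model(x).mean().backward()             # no allreduce issued
    assert linear.weight.grad is not None      # local grad is still populated

    model(x).mean().backward()                 # this step allreduces the
                                               # accumulated gradient
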
-  // if (!grad_need_hooks_) {
-  //   return;
-  // }
+  if (!grad_need_hooks_) {
+    const auto &var_locator = variable_locators_[var_index];
+    const auto group_index = var_locator.group_index;
+    const auto inside_group_index = var_locator.inside_group_index;
+    auto &group = groups_[group_index];
+    auto &group_tensor = group.dense_tensors_[inside_group_index];
+
+    auto *autograd_meta = tensors_[var_index].get_autograd_meta();
+    auto &grad_tensor = static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();
+
+    if (!HasGrad(var_index)) {
+      group_tensor.ShareDataWith(phi::DenseTensor());
+    } else {
+      auto grad_dense_tensor =
+          *(std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor.impl()));
+      group_tensor.ShareDataWith(grad_dense_tensor);
+    }
+    return;
+  }

   VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name()
           << "@Grad] arrived and triggered disthook";
@@ -828,12 +843,10 @@ void EagerReducer::MarkGroupReady(size_t group_index) {
   for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0;
        ++next_group_) {
     UNUSED auto &group = groups_[next_group_];
-    if (grad_need_hooks_) {
-      if (group.is_sparse_) {
-        AllReduceSparse(&group, next_group_);
-      } else {
-        FusedAllReduceSchedule(&group, next_group_);
-      }
+    if (group.is_sparse_) {
+      AllReduceSparse(&group, next_group_);
+    } else {
+      FusedAllReduceSchedule(&group, next_group_);
     }
   }
 }
@@ -921,14 +934,15 @@ void EagerReducer::ProcessUnusedDenseVars() {

 void EagerReducer::FinalizeBackward() {
   groups_need_finalize_ = false;
+  grad_need_hooks_ = false;
   for (auto &group : groups_) {
-    if (!group.is_sparse_ && grad_need_hooks_) {
+    if (!group.is_sparse_) {
       group.task->Synchronize();
     }
   }

   for (auto &group : groups_) {
-    if (!group.is_sparse_ && grad_need_hooks_) {
+    if (!group.is_sparse_) {
       group.dense_contents_.reset();
     }
   }
@@ -940,7 +954,6 @@ void EagerReducer::FinalizeBackward() {
     VLOG(3) << "ProcessUnusedDenseVars is finished.";
   }

-  grad_need_hooks_ = false;
   VLOG(3) << "In the batch, Reducer is finished.";
 }

diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h
index 74db3db746729..5d27086fdbec5 100644
--- a/paddle/fluid/distributed/collective/reducer.h
+++ b/paddle/fluid/distributed/collective/reducer.h
@@ -103,8 +103,7 @@ class EagerReducer {
   void InitializeGroups(const std::vector<std::vector<size_t>> &group_indices);
   void InitializeDenseGroups(const std::vector<int64_t> &tensor_indices_,
                              EagerGroup *p_group);
-  void PrepareForBackward(const std::vector<Tensor> &outputs,
-                          const bool is_sync);
+  void PrepareForBackward(const std::vector<Tensor> &outputs);
   void AddDistHook(size_t var_index);
   void MarkVarReady(const size_t var_index, const bool is_used_var);
   void MarkGroupReady(const size_t group_index);
diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index 3225222f61737..f89fe234c201a 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -675,10 +675,9 @@ void Reducer::TraverseBackwardGraph(
 // After each batch is calculated, the counter of each group (group.pending_)
 // and allreduce sequence counter (next_group_) will be cleaned up again.
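+// The is_sync argument is gone: hooks are now always armed here, and the
+// Python side (DataParallel.forward in parallel.py) simply skips calling
+// prepare_for_backward when gradient synchronization is not needed.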
void Reducer::PrepareForBackward( - const std::vector> &outputs, - const bool is_sync) { + const std::vector> &outputs) { VLOG(3) << "after forward, then reset count for backward."; - grad_need_hooks_ = is_sync; + grad_need_hooks_ = true; next_group_ = 0; std::for_each(groups_.begin(), groups_.end(), [](Group &group) { group.pending_ = group.variable_indices_.size(); @@ -711,9 +710,7 @@ void Reducer::PrepareForBackward( if (find_unused_vars_once_ || find_unused_vars_each_step_) { unused_vars_.clear(); - if (grad_need_hooks_) { - TraverseBackwardGraph(outputs); - } + TraverseBackwardGraph(outputs); // only check once in first step find_unused_vars_once_ = false; } diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index 902c3036acc78..c455f962788b8 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -146,8 +146,7 @@ class Reducer { void PrepareDeps(const std::unordered_set& init_nodes); void PrepareForBackward( - const std::vector>& outputs, - const bool is_sync); + const std::vector>& outputs); void AddDistHook(size_t var_index); diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index fe1d82c766a0e..b84dd7fcbe1bb 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -1407,14 +1407,11 @@ void BindDistributed(py::module *m) { .def(py::init(&CreateEagerReducer)) .def( "prepare_for_backward", - [](distributed::EagerReducer &self, - py::handle py_tensors, - bool is_sync) { + [](distributed::EagerReducer &self, py::handle py_tensors) { auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); - self.PrepareForBackward(params, is_sync); + self.PrepareForBackward(params); }, py::arg("tensors"), - py::arg("is_sync"), py::call_guard()); } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index bd18d4b3319b2..1eb5f8bd4764c 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -2569,7 +2569,6 @@ void BindImperative(py::module *m_ptr) { .def("prepare_for_backward", &imperative::Reducer::PrepareForBackward, py::arg("vars"), - py::arg("is_sync"), py::call_guard()); m.def("assign_group_by_size", diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 004c21c1346b1..51e0527e4fa99 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -818,9 +818,13 @@ def forward(self, x): def forward(self, *inputs, **kwargs): outputs = self._layers(*inputs, **kwargs) - if self._strategy.nranks > 1 and framework._dygraph_tracer()._has_grad: + if ( + self._strategy.nranks > 1 + and framework._dygraph_tracer()._has_grad + and self.grad_need_sync + ): self._reducer.prepare_for_backward( - list(self._find_varbase(outputs)), self.grad_need_sync + list(self._find_varbase(outputs)) ) return outputs From 17fb92b355a7f8d0f505c3221087f69d16571f94 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 28 Oct 2022 21:39:47 +0800 Subject: [PATCH 24/91] generate static graph code for some ops by yaml (#47416) --- paddle/fluid/operators/angle_op.cc | 94 -------------- paddle/fluid/operators/argsort_op.cc | 115 ------------------ paddle/fluid/operators/bmm_op.cc | 103 ---------------- paddle/fluid/operators/bmm_op.h | 63 ---------- paddle/fluid/operators/determinant_op.cc | 67 ---------- paddle/phi/api/yaml/backward.yaml | 43 +++++++ .../generator/templates/operator_utils.c.j2 | 2 +- paddle/phi/api/yaml/legacy_backward.yaml | 43 
------- paddle/phi/api/yaml/legacy_ops.yaml | 36 ------ paddle/phi/api/yaml/op_compat.yaml | 24 ++++ paddle/phi/api/yaml/ops.yaml | 36 ++++++ paddle/phi/kernels/cpu/angle_grad_kernel.cc | 4 +- paddle/phi/kernels/gpu/angle_grad_kernel.cu | 4 +- paddle/phi/ops/compat/angle_sig.cc | 30 ----- paddle/phi/ops/compat/argsort_sig.cc | 29 ----- paddle/phi/ops/compat/bmm_sig.cc | 26 ---- paddle/phi/ops/compat/determinant_sig.cc | 28 ----- 17 files changed, 110 insertions(+), 637 deletions(-) delete mode 100644 paddle/fluid/operators/angle_op.cc delete mode 100644 paddle/fluid/operators/argsort_op.cc delete mode 100644 paddle/fluid/operators/bmm_op.cc delete mode 100644 paddle/fluid/operators/bmm_op.h delete mode 100644 paddle/phi/ops/compat/angle_sig.cc delete mode 100644 paddle/phi/ops/compat/argsort_sig.cc delete mode 100644 paddle/phi/ops/compat/bmm_sig.cc delete mode 100644 paddle/phi/ops/compat/determinant_sig.cc diff --git a/paddle/fluid/operators/angle_op.cc b/paddle/fluid/operators/angle_op.cc deleted file mode 100644 index ccd5584e8dedf..0000000000000 --- a/paddle/fluid/operators/angle_op.cc +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class AngleOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class AngleOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of angle op."); - AddOutput("Out", "(Tensor), The output tensor of angle op."); - AddComment(R"DOC( -Angle Operator. - -This operator is used to perform elementwise angle for input $X$. 
-$$out = angle(x)$$ - -)DOC"); - } -}; - -class AngleGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(dtype, ctx.GetPlace()); - } -}; - -template -class AngleGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr retv) const override { - retv->SetType("angle_grad"); - retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - retv->SetInput("X", this->Input("X")); - retv->SetAttrMap(this->Attrs()); - retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(angle, - AngleInferShapeFunctor, - PD_INFER_META(phi::RealAndImagInferMeta)); - -DECLARE_INFER_SHAPE_FUNCTOR(angle_grad, - AngleGradInferShapeFunctor, - PD_INFER_META(phi::AngleGradInferMeta)); - -REGISTER_OPERATOR(angle, - ops::AngleOp, - ops::AngleOpMaker, - ops::AngleGradMaker, - ops::AngleGradMaker, - AngleInferShapeFunctor); - -REGISTER_OPERATOR(angle_grad, ops::AngleGradOp, AngleGradInferShapeFunctor); diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc deleted file mode 100644 index f17723bf83f65..0000000000000 --- a/paddle/fluid/operators/argsort_op.cc +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class ArgsortOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class ArgsortGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); - } -}; - -class ArgsortOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) The input of Argsort op."); - AddOutput("Out", - "(Tensor) The sorted tensor of Argsort op, with the same " - "shape as Input(X)."); - AddOutput("Indices", - "(Tensor) The indices of a tensor giving the sorted order, with " - "the same shape as Input(X)."); - AddComment(R"DOC( -Argsort operator - -Performs sorting on the input tensor along the given axis and outputs two -tensors, Output(Out) and Output(Indices). They reserve the same shape -with Input(X), and Output(Out) represents the sorted tensor while -Output(Indices) gives the sorted order along the given axis Attr(axis). - - )DOC"); - AddAttr("axis", - "(int, default -1) The axis along which to sort the tensor. " - "When axis < 0, the actual axis will be the |axis|'th " - "counting backwards. Default -1, the last dimension.") - .SetDefault(-1); - AddAttr( - "descending", - "(bool, default false) The descending attribute is a flag to tell" - "algorithm how to sort the input data." - "If descending is true, will sort by descending order," - "else if false, sort by ascending order. Default value is false.") - .SetDefault(false); - } -}; - -template -class ArgsortGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("argsort_grad"); - op->SetInput("Indices", this->Output("Indices")); - op->SetInput("X", this->Input("X")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(ArgsortGradNoNeedBufferVarsInferer, "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(argsort, - ArgsortInferShapeFunctor, - PD_INFER_META(phi::ArgsortInferMeta)); -REGISTER_OPERATOR(argsort, - ops::ArgsortOp, - ops::ArgsortOpMaker, - ops::ArgsortGradOpMaker, - ops::ArgsortGradOpMaker, - ArgsortInferShapeFunctor); -REGISTER_OPERATOR(argsort_grad, - ops::ArgsortGradOp, - ops::ArgsortGradNoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/bmm_op.cc b/paddle/fluid/operators/bmm_op.cc deleted file mode 100644 index b27594eed3a3e..0000000000000 --- a/paddle/fluid/operators/bmm_op.cc +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "paddle/fluid/operators/bmm_op.h" - -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class BmmOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(data_type, ctx.device_context()); - } -}; - -class BmmOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The first input tensor of Bmm op."); - AddInput("Y", "(Tensor), The second input tensor of Bmm op."); - AddOutput("Out", "(Tensor), The output tensor of Bmm op."); - AddComment(R"DOC( -The Bmm operator is used to perform batched matrix multiplication -over the last two dimensions of the input tensors `X` and `Y` -which are both 3-dimentionsal. - -Examples: -- X: [B, M, K], Y: [B, K, N] => Out: [B, M, N] - - )DOC"); - } -}; - -class BmmOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); - } -}; - -template -class BmmOpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr retv) const override { - retv->SetType("bmm_grad"); - retv->SetInput("X", this->Input("X")); - retv->SetInput("Y", this->Input("Y")); - retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - retv->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(bmm, - BmmInferShapeFunctor, - PD_INFER_META(phi::BmmInferMeta)); -DECLARE_INFER_SHAPE_FUNCTOR(bmm_grad, - BmmGradInferShapeFunctor, - PD_INFER_META(phi::BmmGradInferMeta)); -REGISTER_OPERATOR(bmm, - ops::BmmOp, - ops::BmmOpMaker, - ops::BmmOpGradMaker, - ops::BmmOpGradMaker, - BmmInferShapeFunctor); -REGISTER_OPERATOR(bmm_grad, ops::BmmOpGrad, BmmGradInferShapeFunctor); diff --git a/paddle/fluid/operators/bmm_op.h b/paddle/fluid/operators/bmm_op.h deleted file mode 100644 index 5ca8df0182049..0000000000000 --- a/paddle/fluid/operators/bmm_op.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#ifndef PADDLE_FLUID_OPERATORS_BMM_OP_H_ -#define PADDLE_FLUID_OPERATORS_BMM_OP_H_ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" -namespace paddle { -namespace operators { - -using Tensor = phi::DenseTensor; - -static void ReshapeTensorIntoMatrixSequence( - phi::DenseTensor *x, const phi::funcs::MatDescriptor &descriptor) { - int64_t h, w; - h = descriptor.height_; - w = descriptor.width_; - if (descriptor.trans_) { - std::swap(w, h); - } - - x->Resize({descriptor.batch_size_, h, w}); -} - -static void ReshapeXYOutIntoMatrixSequence(phi::DenseTensor *x, - phi::DenseTensor *y, - phi::DenseTensor *out, - bool trans_x, - bool trans_y) { - auto x_dim = x->dims(); - auto y_dim = y->dims(); - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, false); - auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y_dim, 0, false); - - out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), - mat_dim_x.height_, - mat_dim_y.width_}); - - ReshapeTensorIntoMatrixSequence(x, mat_dim_x); - ReshapeTensorIntoMatrixSequence(y, mat_dim_y); -} - -} // namespace operators -} // namespace paddle -#endif // PADDLE_FLUID_OPERATORS_BMM_OP_H_ diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 89d5d2ded15f9..56e39747afc5e 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -23,57 +23,6 @@ namespace paddle { namespace operators { -class DeterminantOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class DeterminantOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Input", "(Tensor) The input tensor of determinant."); - AddOutput("Out", - "(Tensor) The output Tensor containing the determinant" - "value of a square matrix or batches of square matrices "); - - AddComment(R"DOC( -Determinant Operator.)DOC"); - } -}; - -class DeterminantGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -template -class DeterminantGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("determinant_grad"); - grad_op->SetInput("Input", this->Input("Input")); - grad_op->SetInput("Out", this->Output("Out")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - 
grad_op->SetOutput(framework::GradVarName("Input"), - this->InputGrad("Input")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(DeterminantGradNoNeedBufferVarsInferer, - "Input"); - class SlogDeterminantOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -154,22 +103,6 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; -DECLARE_INFER_SHAPE_FUNCTOR(determinant, - DeterminantInferShapeFunctor, - PD_INFER_META(phi::UnchangedInferMeta)); -REGISTER_OPERATOR(determinant, - ops::DeterminantOp, - ops::DeterminantOpMaker, - ops::DeterminantGradOpMaker, - ops::DeterminantGradOpMaker, - DeterminantInferShapeFunctor); - -DECLARE_INFER_SHAPE_FUNCTOR(determinant_grad, - DeterminantGradInferShapeFunctor, - PD_INFER_META(phi::GeneralUnaryGradInferMeta)); -REGISTER_OPERATOR(determinant_grad, - ops::DeterminantGradOp, - DeterminantGradInferShapeFunctor); DECLARE_INFER_SHAPE_FUNCTOR(slogdeterminant, SlogDeterminantInferShapeFunctor, diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index db97795b5b425..cb51e8fa13f4f 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -20,6 +20,28 @@ func : acosh_grad inplace : (out_grad -> x_grad) +- backward_op : angle_grad + forward : angle (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : angle_grad + +- backward_op : argsort_grad + forward : argsort (Tensor x, int axis, bool descending) -> Tensor(out), Tensor(indices) + args : (Tensor indices, Tensor x, Tensor out_grad, int axis, bool descending) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : argsort_grad + data_type : out_grad + no_need_buffer : x + - backward_op : asin_grad forward : asin (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -74,6 +96,16 @@ func : atanh_grad inplace : (out_grad -> x_grad) +- backward_op : bmm_grad + forward : bmm (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : BmmGradInferMeta + kernel : + func : bmm_grad + data_type : out_grad + - backward_op : cholesky_grad forward : cholesky (Tensor x, bool upper) -> Tensor(out) args : (Tensor out, Tensor out_grad, bool upper) @@ -127,6 +159,17 @@ func : cross_grad data_type : out_grad +- backward_op : det_grad + forward : det (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : determinant_grad + data_type : out_grad + - backward_op : diag_grad forward : diag (Tensor x, int offset, float padding_value) -> Tensor(out) args : (Tensor x, Tensor out_grad, int offset) diff --git a/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 index d2b0cf3290b8d..60fd251f446d2 100644 --- a/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 @@ -109,7 +109,7 @@ KernelSignature {{api["op_name"] | to_pascal_case }}OpArgumentMapping(const Argu {% endfor %} {{get_output_list(api["outputs"], kernel_args)}}; {% if api["kernel"]["func"] | length == 1 %} - 
KernelSignature sig("{{api["name"]}}", std::move(inputs), std::move(attrs), std::move(outputs)); + KernelSignature sig("{{api["kernel"]["func"][0]}}", std::move(inputs), std::move(attrs), std::move(outputs)); return sig; {% else %}{# it has kernel for selected rows #} const char* kernel_name = ctx.IsSelectedRowsInput({{kernel_args[0] | to_opmaker_name_cstr}}) ? "{{api["kernel"]["func"][1]}}" : "{{api["kernel"]["func"][0]}}"; diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index ced3d75bb9639..916f5c405d7a7 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -100,30 +100,6 @@ kernel : func : amin_grad -- backward_op : angle_grad - forward : angle (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : angle_grad - data_transform: - skip_transform : out_grad - -- backward_op : argsort_grad - forward : argsort (Tensor x, int axis, bool descending) -> Tensor(out), Tensor(indices) - args : (Tensor indices, Tensor x, Tensor out_grad, int axis, bool descending) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : argsort_grad - data_type : out_grad - no_need_buffer : x - - backward_op : as_complex_grad forward : as_complex (Tensor x) -> Tensor(out) args : (Tensor out_grad) @@ -222,15 +198,6 @@ kernel : func : bilinear_tensor_product_grad -- backward_op : bmm_grad - forward : bmm (Tensor x, Tensor y) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor out_grad) - output : Tensor(x_grad), Tensor(y_grad) - infer_meta : - func : BmmGradInferMeta - kernel : - func : bmm_grad - - backward_op : brelu_grad forward : brelu (Tensor x, float t_min, float t_max) -> Tensor(out) args : (Tensor x, Tensor out_grad, float t_min, float t_max) @@ -515,16 +482,6 @@ kernel : func : depthwise_conv2d_transpose_grad -- backward_op : det_grad - forward : det (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : determinant_grad - - backward_op : divide_double_grad forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index b0d79886c14dd..de290bd169f6e 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -144,15 +144,6 @@ func : amin backward : amin_grad -- op : angle - args : (Tensor x) - output : Tensor - infer_meta : - func : RealAndImagInferMeta - kernel : - func : angle - backward : angle_grad - - op : any args : (Tensor x, int64_t[] axis={}, bool keepdim=false) output : Tensor(out) @@ -191,15 +182,6 @@ kernel : func : arg_min -- op : argsort - args : (Tensor x, int axis=-1, bool descending=false) - output : Tensor(out), Tensor(indices) - infer_meta : - func : ArgsortInferMeta - kernel : - func : argsort - backward : argsort_grad - - op : as_complex args : (Tensor x) output : Tensor @@ -355,15 +337,6 @@ kernel : func : bitwise_xor -- op : bmm - args : (Tensor x, Tensor y) - output : Tensor - infer_meta : - func : BmmInferMeta - kernel : - func : bmm - backward : bmm_grad - - op : box_coder args : (Tensor prior_box, Tensor prior_box_var, Tensor 
target_box, str code_type, bool box_normalized, int axis, float[] variance) output : Tensor(output_box) @@ -618,15 +591,6 @@ func : depthwise_conv2d_transpose backward : depthwise_conv2d_transpose_grad -- op : det - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : determinant - backward : det_grad - - op : diag_embed args : (Tensor input, int offset, int dim1, int dim2) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 59d258f0b0a88..304027861e3d6 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -41,9 +41,20 @@ - op : angle backward : angle_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false] +- op : argsort + inputs : + x : X + outputs : + out : Out + indices : Indices + - op : asin inputs : x : X @@ -101,6 +112,12 @@ extra : attrs : [bool use_mkldnn = false] +- op : bmm + inputs : + {x : X, y : Y} + outputs : + out : Out + - op : ceil backward : ceil_grad extra : @@ -226,6 +243,13 @@ extra : attrs : [float moving_rate = 0.9] +- op : det (determinant) + backward : det_grad (determinant_grad) + inputs : + x : Input + outputs : + out : Out + - op : diag (diag_v2) backward : diag_grad (diag_v2_grad) inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index ec1ba17be672d..e61b7490a15f7 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -16,6 +16,24 @@ func : acosh backward : acosh_grad +- op : angle + args : (Tensor x) + output : Tensor + infer_meta : + func : RealAndImagInferMeta + kernel : + func : angle + backward : angle_grad + +- op : argsort + args : (Tensor x, int axis=-1, bool descending=false) + output : Tensor(out), Tensor(indices) + infer_meta : + func : ArgsortInferMeta + kernel : + func : argsort + backward : argsort_grad + - op : asin args : (Tensor x) output : Tensor @@ -69,6 +87,15 @@ kernel : func : bernoulli +- op : bmm + args : (Tensor x, Tensor y) + output : Tensor + infer_meta : + func : BmmInferMeta + kernel : + func : bmm + backward : bmm_grad + - op : cholesky args : (Tensor x, bool upper=false) output : Tensor @@ -115,6 +142,15 @@ data_type : x backward : cross_grad +- op : det + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : determinant + backward : det_grad + - op : diag args : (Tensor x, int offset = 0, float padding_value = 0.0) output : Tensor diff --git a/paddle/phi/kernels/cpu/angle_grad_kernel.cc b/paddle/phi/kernels/cpu/angle_grad_kernel.cc index d12501916d85d..e3b10f0fc4b2e 100644 --- a/paddle/phi/kernels/cpu/angle_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/angle_grad_kernel.cc @@ -25,4 +25,6 @@ PD_REGISTER_KERNEL(angle_grad, float, double, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->InputAt(1).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/angle_grad_kernel.cu b/paddle/phi/kernels/gpu/angle_grad_kernel.cu index 062c39a9d1f3f..e32c50e4c42e2 100644 --- a/paddle/phi/kernels/gpu/angle_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/angle_grad_kernel.cu @@ -25,4 +25,6 @@ PD_REGISTER_KERNEL(angle_grad, float, double, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->InputAt(1).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/ops/compat/angle_sig.cc b/paddle/phi/ops/compat/angle_sig.cc deleted file mode 100644 index 
63b10e6bf401d..0000000000000 --- a/paddle/phi/ops/compat/angle_sig.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature AngleOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("angle", {"X"}, {}, {"Out"}); -} - -KernelSignature AngleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("angle_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(angle, phi::AngleOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(angle_grad, phi::AngleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/argsort_sig.cc b/paddle/phi/ops/compat/argsort_sig.cc deleted file mode 100644 index 70531f16916dd..0000000000000 --- a/paddle/phi/ops/compat/argsort_sig.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature ArgsortGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("argsort_grad", - {"Indices", "X", "Out@GRAD"}, - {"axis", "descending"}, - {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(argsort_grad, phi::ArgsortGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/bmm_sig.cc b/paddle/phi/ops/compat/bmm_sig.cc deleted file mode 100644 index 415a90c3d3b3f..0000000000000 --- a/paddle/phi/ops/compat/bmm_sig.cc +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature BmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "bmm_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(bmm_grad, phi::BmmGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/determinant_sig.cc b/paddle/phi/ops/compat/determinant_sig.cc deleted file mode 100644 index ee1d53704c123..0000000000000 --- a/paddle/phi/ops/compat/determinant_sig.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature DeterminantGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "determinant_grad", {"Input", "Out", "Out@GRAD"}, {}, {"Input@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(determinant_grad, - phi::DeterminantGradOpArgumentMapping); From c036c5c0b9f28bcd7a48592b9f5dc78046837924 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 28 Oct 2022 22:45:35 +0800 Subject: [PATCH 25/91] Add fused_allreduce_gradients_with_group for PPFleetX (#47447) * add fused_allreduce_gradients_with_group * add scale * fix ci --- .../fleet/utils/hybrid_parallel_util.py | 50 +++++++++++++------ 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index fec3e455f8ab2..c88a967035874 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -26,7 +26,7 @@ __all__ = [] -def _apply_collective_grads(parameters, comm_group): +def _apply_collective_grads(parameters, comm_group, bucket_size, scale=None): grad_var_set = set() grad_vars = [] sparse_grad_vars = [] @@ -41,28 +41,35 @@ def _apply_collective_grads(parameters, comm_group): assert g_var not in grad_var_set grad_var_set.add(g_var) - coalesced_grads_and_vars = build_groups(grad_vars, 128 * 1024 * 1024) + coalesced_grads_and_vars = build_groups(grad_vars, bucket_size) nranks = ( paddle.distributed.get_world_size() if comm_group is None else comm_group.nranks ) + + scale = nranks if scale is None else 1.0 / scale + scale = None if scale == 1.0 else scale + for coalesced_grad, _, _ in coalesced_grads_and_vars: # need to div nranks - div_factor = paddle.to_tensor(nranks, dtype=coalesced_grad.dtype) - paddle.fluid.framework._dygraph_tracer().trace_op( - type="elementwise_div", - inputs={'X': coalesced_grad, 'Y': div_factor}, - outputs={'Out': coalesced_grad}, - attrs={'axis': -1}, - ) + if scale is not None: + div_factor = paddle.to_tensor(scale, dtype=coalesced_grad.dtype) + paddle.fluid.framework._dygraph_tracer().trace_op( + type="elementwise_div", + inputs={'X': coalesced_grad, 'Y': 
div_factor}, + outputs={'Out': coalesced_grad}, + attrs={'axis': -1}, + ) paddle.distributed.all_reduce(coalesced_grad, group=comm_group) _split_tensors(coalesced_grads_and_vars) -def _apply_collective_grads_eager(parameters, comm_group): +def _apply_collective_grads_eager( + parameters, comm_group, bucket_size, scale=None +): grad_var_set = set() grad_vars = [] @@ -76,16 +83,21 @@ def _apply_collective_grads_eager(parameters, comm_group): assert g_var not in grad_var_set grad_var_set.add(g_var) - coalesced_grads_and_vars = build_groups(grad_vars, 128 * 1024 * 1024) + coalesced_grads_and_vars = build_groups(grad_vars, bucket_size) nranks = ( paddle.distributed.get_world_size() if comm_group is None else comm_group.nranks ) + + scale = 1.0 / nranks if scale is None else scale + scale = None if scale == 1.0 else scale + for coalesced_grad, _, _ in coalesced_grads_and_vars: # need to div nranks - coalesced_grad.scale_(1.0 / nranks) + if scale is not None: + coalesced_grad.scale_(scale) paddle.distributed.all_reduce(coalesced_grad, group=comm_group) _split_tensors(coalesced_grads_and_vars) @@ -172,16 +184,22 @@ def broadcast_dp_parameters(model, hcg): ) -def fused_allreduce_gradients(parameter_list, hcg): - data_parallel_group = None if hcg is None else hcg.get_data_parallel_group() - logger.debug("dp start fuse allreduce gradients") +def fused_allreduce_gradients_with_group( + parameter_list, group, bucket_size=128 * 1024 * 1024, scale=None +): apply_func = ( _apply_collective_grads_eager if in_dygraph_mode() else _apply_collective_grads ) with framework.no_grad(): - apply_func(parameter_list, data_parallel_group) + apply_func(parameter_list, group, bucket_size) + + +def fused_allreduce_gradients(parameter_list, hcg): + data_parallel_group = None if hcg is None else hcg.get_data_parallel_group() + logger.debug("dp start fuse allreduce gradients") + fused_allreduce_gradients_with_group(parameter_list, data_parallel_group) def sharding_reduce_gradients(parameter_list, hcg): From 67ca9d45879f24bc974191dbb01b6d9c1069c833 Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Sat, 29 Oct 2022 22:58:04 +0800 Subject: [PATCH 26/91] [INCUBATE] Add dist save/load for sharding stage2 (#46908) --- .../unittests/collective/fleet/CMakeLists.txt | 9 + .../fleet/dygraph_dist_save_load.py | 419 ++++++++++++++++++ .../fleet/test_dygraph_dist_save_load.py | 38 ++ .../unittests/collective/fleet/testslist.csv | 3 +- .../incubate/distributed/utils/io/__init__.py | 16 + .../distributed/utils/io/dist_load.py | 120 +++++ .../distributed/utils/io/dist_save.py | 392 ++++++++++++++++ python/setup.py.in | 2 + 8 files changed, 998 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/collective/fleet/dygraph_dist_save_load.py create mode 100644 python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_dist_save_load.py create mode 100644 python/paddle/incubate/distributed/utils/io/__init__.py create mode 100644 python/paddle/incubate/distributed/utils/io/dist_load.py create mode 100644 python/paddle/incubate/distributed/utils/io/dist_save.py diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt index b47e4b5b530f9..f853e962043f2 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt @@ -942,4 +942,13 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) py_test_modules( 
test_fleet_log MODULES test_fleet_log ENVS "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_fleet_log PROPERTIES TIMEOUT "200" LABELS + "RUN_TYPE=DIST") +endif() +if((WITH_GPU) AND (LINUX)) + py_test_modules( + test_dygraph_dist_save_load MODULES test_dygraph_dist_save_load ENVS + "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_dygraph_dist_save_load + PROPERTIES TIMEOUT "200" LABELS "RUN_TYPE=DIST") endif() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_dist_save_load.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_dist_save_load.py new file mode 100644 index 0000000000000..0ade6b0cb7d0a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_dist_save_load.py @@ -0,0 +1,419 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import numpy as np +import tempfile +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import ( + GroupShardedOptimizerStage2, +) +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import ( + GroupShardedStage2, +) + +import sys +import subprocess +import argparse + +from paddle import distributed as dist +from paddle.incubate.distributed.utils.io import save, load + +print(load) +epoch = 2 +linear_size = 1000 + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW( + parameters=[ + { + "params": model.parameters(), + } + ] + if opt_group + else model.parameters(), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16, + ) + + return optimizer + + +def train_mlp( + model, + sharding_stage, + batch_size=100, + use_pure_fp16=False, + accumulate_grad=False, + opt_group=False, + save_model=False, + test_minimize=False, + opt_state=None, +): + if sharding_stage != "dp": + group = paddle.distributed.new_group([0, 1], backend="nccl") + if opt_group: + optimizer = optimizer_setting( + model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group + ) + else: + optimizer = 
optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + + if sharding_stage == 2: + optimizer = GroupShardedOptimizerStage2( + params=optimizer._parameter_list, optim=optimizer, group=group + ) + model = GroupShardedStage2( + model, optimizer, group=group, buffer_max_size=2**21 + ) + model._set_reduce_overlap(True) + optimizer._set_broadcast_overlap(True, model) + else: + model = paddle.DataParallel(model) + + # check optimizer.minimize() error + if test_minimize: + try: + optimizer.minimize() + except: + print( + "====== Find sharding_stage2_optimizer.minimize() error ======" + ) + return + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True + ) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True, + ) + train_loader.set_sample_list_generator(train_reader) + + if sharding_stage == 2: + model.to(device="gpu") + if opt_state is not None: + optimizer.set_state_dict(opt_state) + + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + out = model(img) + loss = paddle.nn.functional.cross_entropy(input=out, label=label) + + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + if batch_size == 20: + avg_loss = avg_loss / 5 + avg_loss.backward() + + if not accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + if accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + paddle.device.cuda.synchronize() + + if save_model: + return model, optimizer + return model.parameters() + + +def save_model(model, output_dir, **configs): + configs["save_model"] = True + model, opt = train_mlp(model, **configs) + + model_file = os.path.join( + output_dir, f"rank{dist.get_rank()}model.pdparams" + ) + opt_file = os.path.join(output_dir, f"rank{dist.get_rank()}model.pdopt") + + g_model_file = os.path.join( + output_dir, f"rank{dist.get_rank()}g_model.pdparams" + ) + g_opt_file = os.path.join(output_dir, f"rank{dist.get_rank()}g_model.pdopt") + + paddle.save(model.state_dict(), model_file) + paddle.save(opt.state_dict(), opt_file) + + save( + model.state_dict(), g_model_file, gather_to=[0, 1], state_type="params" + ) + save(opt.state_dict(), g_opt_file, gather_to=[0, 1], state_type="opt") + + +def load_mode(model, model_state_dict, output_param_path, **configs): + configs["save_model"] = False + model.set_state_dict(model_state_dict) + params = train_mlp(model, **configs) + paddle.save(params, output_param_path) + + +def step_check(path1, path2): + m1 = paddle.load(path1) + m2 = paddle.load(path2) + for v1, v2 in zip(m1, m2): + assert np.allclose(v1.numpy(), v2.numpy()) + print(f"value same: {v1.name}") + + +def step_save(strategy, output_dir, seed): + python_exe = sys.executable + # save data + os.makedirs(output_dir + "/logs", exist_ok=True) + filename = os.path.basename(__file__) + cmd = ( + f"{python_exe} -m paddle.distributed.launch --log_dir {output_dir}/logs" + f" --gpus 0,1 {filename} --cmd save --strategy {strategy} --output_dir {output_dir} --seed {seed}" + ) + p = subprocess.Popen(cmd.split()) + p.communicate() + assert p.poll() == 0 + + +def step_load( + saved_strategy, curent_strateggy, saved_dir, load_way, output_path, seed +): + python_exe = sys.executable + os.makedirs(f"{saved_dir}/load/logs", exist_ok=True) + filename = os.path.basename(__file__) + # load dp + cmd = ( + f"{python_exe} -m 
paddle.distributed.launch --log_dir {saved_dir}/load/logs" + f" --gpus 0,1 {filename} --cmd load --strategy {curent_strateggy} --output_dir {saved_dir} --load_dir {saved_dir}/{saved_strategy}/save --load_way {load_way}" + f" --output_param_path {output_path} --seed {seed}" + ) + p = subprocess.Popen(cmd.split()) + p.communicate() + assert p.poll() == 0 + + +def test_save_load(args): + + np.random.seed(args.seed) + paddle.seed(args.seed) + + if args.cmd == "main": + run_case(args) + return + + paddle.distributed.init_parallel_env() + strategy = fleet.DistributedStrategy() + if args.strategy == "dp": + strategy.hybrid_configs = { + "dp_degree": 2, + "mp_degree": 1, + "pp_degree": 1, + "sharding_degree": 1, + } + elif args.strategy == "sharding_stage2": + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": 1, + "pp_degree": 1, + "sharding_degree": 2, + } + else: + raise ValueError(f"Not supported strategy: {args.strategy}") + + fleet.init(is_collective=True, strategy=strategy) + fleet.set_log_level("DEBUG") + + mlp1 = MLP() + output_dir = os.path.join(args.output_dir, args.strategy, args.cmd) + os.makedirs(output_dir, exist_ok=True) + + if args.cmd.lower() == "save": + if args.strategy == "dp": + # DP VS stage2 + save_model( + mlp1, + output_dir, + sharding_stage="dp", + use_pure_fp16=False, + opt_group=False, + save_model=True, + ) + elif args.strategy == "sharding_stage2": + save_model( + mlp1, + output_dir, + sharding_stage=2, + use_pure_fp16=False, + opt_group=False, + save_model=True, + ) + else: + raise ValueError(f"Not supported {args.strategy}") + elif args.cmd.lower() == "load": + output_dir = args.load_dir + model_file = os.path.join( + output_dir, f"rank{dist.get_rank()}model.pdparams" + ) + opt_file = os.path.join(output_dir, f"rank{dist.get_rank()}model.pdopt") + g_model_file = os.path.join( + output_dir, f"rank{args.gather_to}g_model.pdparams" + ) + g_opt_file = os.path.join( + output_dir, f"rank{args.gather_to}g_model.pdopt" + ) + + if args.load_way == "load": + model_file = g_model_file + opt_file = g_opt_file + load_ = lambda x: eval(args.load_way)(x, place='cpu') + else: + load_ = eval(args.load_way) + + model = load_(model_file) + opt = load_(opt_file) + for k in opt.keys(): + print("opt k:", k) + if args.strategy == "dp": + load_mode( + mlp1, + model, + args.output_param_path, + sharding_stage="dp", + use_pure_fp16=False, + opt_group=False, + save_model=False, + opt_state=opt, + ) + elif args.strategy == "sharding_stage2": + load_mode( + mlp1, + model, + args.output_param_path, + sharding_stage=2, + use_pure_fp16=False, + opt_group=False, + save_model=False, + opt_state=opt, + ) + else: + raise ValueError(f"Not supported strategy {args.strategy}") + + else: + raise ValueError(f"Not supported cmd: {args.cmd}") + + +def run_case(args): + + saving_strategy = args.test_case.split(":")[0] + loading_strategy = args.test_case.split(":")[1] + + output_dir = tempfile.mkdtemp() + print("output dir:", output_dir) + os.makedirs(output_dir + "/load_save", exist_ok=True) + # save dp + step_save(saving_strategy, output_dir, args.seed) + # return + + # load dp + p1 = os.path.join(output_dir, "m1.pdparams") + p2 = os.path.join(output_dir, "m2.pdparams") + + step_load( + saving_strategy, + saving_strategy, + output_dir, + "paddle.load", + p1, + args.seed + 1, + ) + step_load( + saving_strategy, loading_strategy, output_dir, "load", p2, args.seed + 2 + ) + + # check + step_check(p1, p2) + + shutil.rmtree(output_dir) + + +if __name__ == '__main__': + + parser = 
argparse.ArgumentParser() + parser.add_argument( + "--cmd", default="main", choices=["main", "save", "load"] + ) + parser.add_argument( + "--strategy", required=False, choices=["dp", "sharding_stage2"] + ) + parser.add_argument( + "--load_way", choices=["paddle.load", "load"], required=False + ) + parser.add_argument("--load_dir", required=False) + parser.add_argument("--output_dir", required=False) + parser.add_argument("--output_param_path", required=False) + parser.add_argument( + "--test_case", + required=False, + choices=[ + "dp:dp", + "dp:sharding_stage2", + "sharding_stage2:dp", + "sharding_stage2:sharding_stage2", + ], + ) + parser.add_argument("--gather_to", required=False, default=0) + parser.add_argument("--seed", type=int, default=2022) + + args = parser.parse_args() + test_save_load(args) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_dist_save_load.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_dist_save_load.py new file mode 100644 index 0000000000000..18aac82f86c83 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_dist_save_load.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest +import subprocess +import sys + + +def strategy_test(saving, loading, gather_to): + cmd = f"{sys.executable} dygraph_dist_save_load.py --test_case {saving}:{loading} --gather_to {gather_to}" + p = subprocess.Popen(cmd.split()) + p.communicate() + assert p.poll() == 0 + + +class TestDistSaveLoad(unittest.TestCase): + def test_dygraph_save_load_dp_sharding_stage2(self): + strategy_test("dp", "sharding_stage2", 0) + strategy_test("dp", "sharding_stage2", 1) + strategy_test("sharding_stage2", "dp", 1) + + +if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" + os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv b/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv index c7fa546322573..15cfa81b51ad1 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv @@ -82,4 +82,5 @@ test_hdfs1,LINUX,,200,EXCLUSIVE:NIGHTLY,../../dist_test.sh,2,,http_proxy=;https_ test_hdfs2,LINUX,,200,EXCLUSIVE:NIGHTLY,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_hdfs3,LINUX,,200,EXCLUSIVE:NIGHTLY,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_checkpoint,LINUX,GPU;ROCM,200,EXCLUSIVE:NIGHTLY,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_log,,,,DIST,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_log,,,200,DIST,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dygraph_dist_save_load,LINUX,GPU,200,DIST,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., diff --git a/python/paddle/incubate/distributed/utils/io/__init__.py b/python/paddle/incubate/distributed/utils/io/__init__.py new file mode 100644 index 0000000000000..7eacf695c74db --- /dev/null +++ b/python/paddle/incubate/distributed/utils/io/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .dist_save import save +from .dist_load import load diff --git a/python/paddle/incubate/distributed/utils/io/dist_load.py b/python/paddle/incubate/distributed/utils/io/dist_load.py new file mode 100644 index 0000000000000..38907489c8d34 --- /dev/null +++ b/python/paddle/incubate/distributed/utils/io/dist_load.py @@ -0,0 +1,120 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid.framework import dygraph_only
+import paddle
+import paddle.distributed as dist
+from paddle.distributed import fleet
+import re
+import copy
+
+
+@dygraph_only
+def load(path, **configs):
+    """
+    Load an object that can be used in paddle from the specified path.
+    The file must have been saved by the distributed save API.
+
+    Note:
+        The file to load must be saved by the API paddle.incubate.distributed.utils.io.save
+
+    Args:
+        path(str|BytesIO) : The path/buffer to load the target object. Generally, the path is the target
+            file path. When loading state_dict from the saved result of the API used to save
+            the inference model, the path may be a file prefix or directory.
+        **configs (dict, optional): other load configuration options for compatibility. We do not
+            recommend using these configurations, they may be removed in the future. If not necessary,
+            DO NOT use them. Default None.
+            The following options are currently supported:
+                (1) place: where to place the loaded state dict.
+                    If the state dict is too large, the place should be set to 'cpu'.
+            Note:
+                Other config values may cause errors. Please don't use any other config options.
+    Returns:
+        Object(Object): a target object that can be used in paddle
+
+    Examples:
+        import paddle
+        paddle.distributed.init_process_group(backend='nccl')
+        paddle.distributed.fleet.init(is_collective=True)
+
+        model = build_model()
+        optimizer = build_optimizer(model)
+
+        dist_model = paddle.distributed_optimizer(model)
+        dist_optimizer = paddle.distributed_optimizer(optimizer)
+
+
+        # load model state dict
+        model_state_dict = paddle.incubate.distributed.utils.io.load(path="path/to/load.pdparams")
+        dist_model.set_state_dict(model_state_dict)
+
+        # load optimizer state dict
+        optimizer_state_dict = paddle.incubate.distributed.utils.io.load(path="path/to/load.pdopt")
+        dist_optimizer.set_state_dict(optimizer_state_dict)
+
+    """
+    if dist.get_world_size() == 1:
+        return paddle.load(path, **configs)
+
+    hcg = fleet.get_hybrid_communicate_group()
+    assert (
+        hcg.get_model_parallel_world_size() == 1
+        and hcg.get_pipe_parallel_world_size() == 1
+    ), "Only Sharding and DP are supported now"
+
+    # assert (
+    #     "place" in configs
+    # ), "the arg place ('cpu' or 'gpu:0', 'gpu:1' ...) must be passed"
+    if "place" not in configs:
+        configs["place"] = "cpu"
+    place = configs["place"]
+    assert isinstance(
+        place, str
+    ), f"configs[place] must be a str, but this is a {type(place)}"
+
+    assert re.search(
+        "^(cpu|gpu:[0-9]*)$", place
+    ), "configs[place] must be cpu, gpu:0, gpu:1 ..."
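+
+    # place has been validated above; load_with_place (below) temporarily
+    # switches the default device so that paddle.load materializes the tensors
+    # on the requested place, then restores the original device.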
+
+    return load_with_place(path, **configs)
+
+
+def load_with_place(path, **configs):
+    place = configs["place"]
+    if place is None:
+        return paddle.load(path)
+
+    origin_place = paddle.get_device()
+    paddle.set_device(place)
+
+    configs = _remove_not_supported_items(configs)
+    state_dict = paddle.load(path, **configs)
+
+    paddle.set_device(origin_place)
+
+    return state_dict
+
+
+def _remove_not_supported_items(configs):
+    __supported_by_load__ = [
+        "model_filename",
+        "params_filename",
+        "return_numpy",
+    ]
+    _configs = copy.copy(configs)
+    for k in configs.keys():
+        if k not in __supported_by_load__:
+            _configs.pop(k, None)
+    return _configs
diff --git a/python/paddle/incubate/distributed/utils/io/dist_save.py b/python/paddle/incubate/distributed/utils/io/dist_save.py
new file mode 100644
index 0000000000000..363f54bcc6d15
--- /dev/null
+++ b/python/paddle/incubate/distributed/utils/io/dist_save.py
@@ -0,0 +1,392 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.distributed as dist
+import paddle.distributed.fleet as fleet
+import re
+import paddle
+from paddle.distributed.fleet.utils.log_util import logger
+from paddle.fluid.framework import dygraph_only
+import copy
+import sys
+
+__all__ = ["save"]
+
+
+@dygraph_only
+def save(state_dict, path, **configs):
+    '''
+    Save a state dict to the specified path in both distributed and single-card environments.
+
+    Note:
+        Now supports saving ``state_dict`` of Layer/Optimizer, Tensor and nested structure containing Tensor, Program.
+
+    Note:
+        Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file,
+        there is no need to distinguish multiple saved files by adding a suffix. The argument ``path``
+        of ``paddle.save`` will be directly used as the saved file name instead of a prefix.
+        In order to unify the saved file name format, we recommend using the paddle standard suffix:
+        1. for ``Layer.state_dict`` , recommend to use ``.pdparams`` ;
+        2. for ``Optimizer.state_dict`` , recommend to use ``.pdopt`` .
+        For specific examples, please refer to API code examples.
+
+    Args:
+        state_dict(dict) : The state dict to be saved.
+        path(str|BytesIO) : The path/buffer of the object to be saved.
+            If saved in the current directory, the input path string will be used as the file name.
+        protocol(int, optional): The protocol version of the pickle module; it must be greater than 1 and less than 5.
+            Default: 4
+        **configs(dict, optional): optional keyword arguments. The following options are currently supported:
+            (1)use_binary_format(bool):
+                Passed through to paddle.save. When the saved object is a static graph variable, you can specify ``use_binary_format``.
+                If True, save the file in the C++ binary format when saving a single static graph variable; otherwise, save it in pickle format.
+                Default: False
+            (2)gather_to(int|list|tuple|None):
+                To specify which global rank(s) to save on. Default is None.
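+                An int selects a single rank; a list/tuple makes every listed
+                rank keep a full copy of the gathered state dict.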
+                None value means distributed saving with no gathering to a single card.
+            (3)state_type(str):
+                Value can be 'params' or 'opt', specifying to save parameters or optimizer state.
+            (4)max_grouped_size(str|int):
+                To limit the max size (how many bits) of an object group to be transferred at a time.
+                If str, the format must be as num+'G/M/K', for example, 3G, 2K, 10M, etc. Default is 3G.
+    Returns:
+        None
+    Examples:
+        import paddle
+        paddle.distributed.init_process_group(backend='nccl')
+        paddle.distributed.fleet.init(is_collective=True)
+
+        model = build_model()
+        optimizer = build_optimizer(model)
+
+        dist_optimizer = paddle.distributed_optimizer(optimizer)
+        dist_model = paddle.distributed_optimizer(model)
+
+        # gather params to rank 0 and then save
+        paddle.incubate.distributed.utils.io.save(model.state_dict(), path="path/to/save.pdparams", gather_to=[0], state_type="params")
+
+        # save whole params on all ranks
+        paddle.incubate.distributed.utils.io.save(model.state_dict(), path="path/to/save.pdparams", gather_to=[0,1], state_type="params")
+
+        # save optimizer state dict on rank 0
+        paddle.incubate.distributed.utils.io.save(optimizer.state_dict(), path="path/to/save.pdopt", gather_to=0, state_type="opt")
+
+    '''
+
+    gather_to = configs.get("gather_to", None)
+    if dist.get_world_size() == 1 or gather_to is None:
+        configs = _remove_not_supported_conf(configs)
+        return paddle.save(state_dict, path, **configs)
+
+    # gather_to is not None and world size > 1
+    state_type = configs.get("state_type", None)
+    assert isinstance(
+        state_type, str
+    ), "must pass an arg state_type='params' or state_type='opt' to specify whether to save a model state_dict or an optimizer state_dict"
+    assert state_type in [
+        "params",
+        "opt",
+    ], "must pass an arg state_type='params' or state_type='opt'"
+
+    if re.search(f"{state_type}$", path) is None:
+        logger.warning(
+            f"You are saving {state_type}, while the path ({path}) does not end with {state_type}"
+        )
+
+    hcg = fleet.get_hybrid_communicate_group()
+    assert (
+        hcg.get_model_parallel_world_size() == 1
+        and hcg.get_pipe_parallel_world_size() == 1
+    ), f"Only DP and Sharding are supported now. 
However, current MP={hcg.get_model_parallel_world_size()} , PP={hcg.get_pipe_parallel_world_size()}"
+
+    sharding_group = hcg.get_sharding_parallel_group()
+    dp_group = hcg.get_data_parallel_group()
+
+    if state_type == "params":
+        if dp_group.nranks > 1:
+            assert _same_keys(
+                state_dict, dp_group
+            ), "only sharding stage 1/2 and DP are supported now"
+        if sharding_group.nranks > 1:
+            assert _same_keys(
+                state_dict, sharding_group
+            ), "only sharding stage 1/2 and DP are supported now"
+        configs = _remove_not_supported_conf(configs)
+        return paddle.save(state_dict, path, **configs)
+
+    # state_type == "opt"
+    if sharding_group.nranks == 1:
+        configs = _remove_not_supported_conf(configs)
+        return paddle.save(state_dict, path, **configs)
+    if _same_keys(state_dict, sharding_group):
+        return paddle.save(state_dict, path, **configs)
+    assert isinstance(gather_to, (list, tuple, int))
+    if isinstance(gather_to, int):
+        gather_to = [gather_to]
+    max_size = configs.get("max_grouped_size", "3G")
+    try:
+        logger.info("state_dict_keys:" + str(state_dict.keys()))
+        gathered_state_dict = _gather_state_dict(
+            state_dict, gather_to, sharding_group, max_size=max_size
+        )
+        logger.info("gathered_state_dict_keys:" + str(gathered_state_dict.keys()))
+        if dist.get_rank() in gather_to:
+            configs = _remove_not_supported_conf(configs)
+            paddle.save(gathered_state_dict, path, **configs)
+    except:
+        raise RuntimeError(
+            f'''Saving failed. Following are some suggestions:
+    1) pass the param max_grouped_size to make the grouped size smaller (current value of max_grouped_size is {max_size})
+    2) if sharding stage is 1, use paddle.save rather than paddle.distributed.save
+    3) Contact the developers
+'''
+        )
+
+
+def _state_dict_groups(state_dict, max_size):
+    """
+    Description:
+        Generator of state dict groups to transfer. The size of each group is less than max_size.
+    """
+
+    # find the max size of a whole tensor
+    # for now we only support transferring at least one whole tensor
+    max_tensor_size = 0
+    for k, v in state_dict.items():
+        if max_tensor_size < sys.getsizeof(v) + sys.getsizeof(k):
+            max_tensor_size = sys.getsizeof(v) + sys.getsizeof(k)
+
+    max_size = max(max_size, max_tensor_size)
+    logger.debug(f"max tensor size: {max_size}")
+
+    state_group = dict()
+    k_list = list(state_dict.keys())
+    index = 0
+    bits = 0
+
+    # generate groups until the end
+    while index < len(k_list):
+        bsize = sys.getsizeof(state_dict[k_list[index]]) + sys.getsizeof(
+            k_list[index]
+        )
+        if bits + bsize >= max_size:
+            yield state_group
+            state_group = dict()
+            bits = 0
+
+        state_group[k_list[index]] = state_dict[k_list[index]]
+        index += 1
+        bits += bsize
+
+        if index == len(k_list) and bits > 0:
+            yield state_group
+
+
+def all_empty(dict_list):
+    """
+    Check if all items are empty
+    """
+    for v in dict_list:
+        if len(v) > 0:
+            return False
+    return True
+
+
+def _parse_mem_size_to_bits(max_size):
+    """
+    Parse an integer or a mem size str to an integer
+    convert xxxG to xxx * 1024^3
+    convert xxxM to xxx * 1024^2
+    convert xxxK to xxx * 1024^1
+    """
+    assert isinstance(max_size, (int, str))
+    if isinstance(max_size, str):
+        assert re.search(
+            "^[0-9]*[GMK]$", max_size
+        ), f"Wrong max_size format; the format must be like 10K, 9M, 200G, etc., or an integer. 
However this is {max_size}"
+        num = int(max_size[:-1])
+        if max_size[-1] == "G":
+            max_size = num * 1024**3
+        elif max_size[-1] == "M":
+            max_size = num * 1024**2
+        else:
+            max_size = num * 1024
+    return max_size
+
+
+def _gather_state_dict(state_dict, dst, group, max_size="3G"):
+    """
+    Description:
+        Gather state dicts across all group ranks to dst, de-duplicating the same elements, including LR_Scheduler.
+    Args:
+        state_dict(dict):
+            local state dict
+        dst(int|list|tuple):
+            ranks the state dicts are gathered to
+        group(ProcessGroup):
+            group across which the state dicts are gathered
+        max_size(int|str):
+            The max limitation of the gathered tensor group size transferred at a time. Default is 3G bits.
+            Each rank's max tensor group before gathering is max_size // group.size
+    Returns:
+        Gathered state dict
+    """
+    assert isinstance(
+        dst, (list, tuple, int)
+    ), "dst's type must be one of int, list and tuple"
+    if isinstance(dst, int):
+        dst = [dst]
+
+    max_size = _parse_mem_size_to_bits(max_size)
+    max_size //= dist.get_world_size(group)
+
+    logger.debug(f"len state_dict: {len(state_dict)}")
+
+    state_dict_ = copy.copy(state_dict)
+    mw = None
+    has_mw = False
+    has_lr = False
+
+    # Remove master_weights and LR_Scheduler to ensure that all the elements of the state dict are str->Tensor
+    if "master_weights" in state_dict_:
+        mw = state_dict_.pop("master_weights", None)
+        has_mw = True
+    if "LR_Scheduler" in state_dict_:
+        lr = state_dict_.pop("LR_Scheduler", None)
+        has_lr = True
+
+    # Gather optimizer state_dict
+    output = _grouped_gather_data_dict(state_dict_, dst, group, max_size)
+
+    # Gather master_weights if it exists
+    if isinstance(mw, dict):
+        masters = _grouped_gather_data_dict(mw, dst, group, max_size)
+    else:
+        assert mw is None, f"Wrong type of master weights. type: {type(mw)}"
+
+    # assign master_weights and LR_Scheduler
+    # Because LR_Schedulers are the same across the group, it just needs to be reset
+    if has_mw:
+        output["master_weights"] = masters
+    if has_lr:
+        output["LR_Scheduler"] = lr
+    return output
+
+
+def _grouped_gather_data_dict(state_data_dict, dst, group, max_size):
+    """
+    Description:
+        Gather state data dict by groups.
+    Args:
+        state_data_dict(dict):
+            local dict to transfer. The state_data_dict only contains the mapping: str->paddle.Tensor
+        dst(int|list|tuple):
+            ranks the state dicts are gathered to
+        group(ProcessGroup):
+            group across which the state dicts are gathered
+        max_size(int|str):
+            The max limitation of the gathered tensor group size transferred at a time. Default is 3G bits. 
+            Each rank's max tensor group before gathering is max_size // group.size
+    Returns:
+        Gathered state_data_dict
+
+    """
+    numpy_dict = {}
+    logger.debug(f"len state_dict_ : {len(state_data_dict)}")
+
+    for k, v in state_data_dict.items():
+        try:
+            numpy_dict[k] = v.numpy()
+        except:
+            raise TypeError(
+                f"the object (type of {type(v)}) of '{k}' is neither tensor nor parameter"
+            )
+
+    total = 0
+    output_state = dict()
+
+    logger.info("start all gather ...")
+    # gather all state_dict by groups
+    for state in _state_dict_groups(numpy_dict, max_size):
+        s_list = []
+        total += len(state)
+        logger.info(f"keys to gather: {total} / {len(numpy_dict)}")
+        dist.all_gather_object(s_list, state, group)
+        if dist.get_rank() in dst:
+            for s in s_list:
+                for k, v in s.items():
+                    logger.debug(f"gathered: {k}, {v.shape}")
+                output_state.update(s)
+
+        logger.debug(
+            f"s list size: {sum(len(s) for s in s_list)} output: {len(output_state)}"
+        )
+
+    # Because the number of groups may differ across ranks, we should wait until all objects are gathered.
+    # The while loop breaks once the objects gathered from every rank are empty, which means all object transfers are done.
+    while True:
+        s_list = []
+        state = {}
+        logger.debug("while True")
+        dist.all_gather_object(s_list, state, group)
+        if all_empty(s_list):
+            break
+        if dist.get_rank() in dst:
+            for s in s_list:
+                for k, v in s.items():
+                    logger.debug(f"gathered: {k}, {v.shape}")
+                output_state.update(s)
+        logger.debug(
+            f"s list size: {sum(len(s) for s in s_list)} output: {len(output_state)}"
+        )
+
+    logger.debug("all gathered ...")
+
+    if dist.get_rank() in dst:
+        # convert numpy.ndarray to Tensor in CPU place
+        place = paddle.CPUPlace()
+        for k in output_state.keys():
+            output_state[k] = paddle.to_tensor(output_state[k], place=place)
+            output_state[k].name = k
+        return output_state
+    return {}
+
+
+def _same_keys(state_dict, group):
+    """
+    Check whether all keys in each dict in the group are the same.
+    Used in sharding strategy to determine whether a dict needs to be gathered.
+    """
+    keys = list(state_dict.keys())
+    key_list = []
+    logger.info(keys)
+    dist.all_gather_object(key_list, keys, group=group)
+    for k in key_list:
+        if not k == keys:
+            return False
+    return True
+
+
+def _remove_not_supported_conf(configs):
+    """
+    Remove the config values not supported by paddle.save
+    """
+    __supported_by_save__ = ["use_binary_format"]
+    configs_ = copy.copy(configs)
+    for k in configs.keys():
+        if k not in __supported_by_save__:
+            configs_.pop(k, None)
+    return configs_
diff --git a/python/setup.py.in b/python/setup.py.in
index 92b75cd06774b..76daa99e4b446 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -383,6 +383,8 @@ packages=['paddle',
           'paddle.incubate.optimizer.functional',
           'paddle.incubate.autograd',
           'paddle.incubate.distributed',
+          'paddle.incubate.distributed.utils',
+          'paddle.incubate.distributed.utils.io',
           'paddle.incubate.distributed.fleet',
           'paddle.incubate.distributed.models',
           'paddle.incubate.distributed.models.moe',

From 605b3f98636918bfbc9d9480aafa9bd973b580fa Mon Sep 17 00:00:00 2001
From: Roc <30228238+sljlp@users.noreply.github.com>
Date: Sun, 30 Oct 2022 23:18:54 +0800
Subject: [PATCH 27/91] Fix gen cmake (#47457)

* mapping from dist name scope to single name scope

* update

* fix gen cmake

* support runtype is '' when using test_runner.py

* Revert "fix gen cmake"

This reverts commit d7a653d33aeacb8bb4a13957c9961ed9f626a18f. 
* update gen-ut-cmakelist; test=document_fix * revert code; test=document_fix --- .../unittests/collective/fleet/testslist.csv | 48 +++++++++---------- tools/gen_ut_cmakelists.py | 14 +++++- 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv b/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv index 15cfa81b51ad1..14284a12059dc 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv @@ -1,23 +1,23 @@ name,os,arch,timeout,run_type,launcher,num_port,run_serial,envs,conditions test_fleet_sharding_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,350,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_static_mp_layers,LINUX;WIN32,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_dgc_op,,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC -test_dgc_optimizer,,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC +test_fleet_static_mp_layers,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dgc_op,,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC +test_dgc_optimizer,,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC test_parallel_margin_cross_entropy,,GPU,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_dygraph_sharding_stage3,,GPU,350,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_parallel_dygraph_transformer,,GPU,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212 test_parallel_dygraph_transformer,,ROCM,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_fp16_allreduce_meta_optimizer,LINUX;WIN32,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_fp16_allreduce_meta_optimizer,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_rnn_dp,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_mp_layers,,GPU,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_tcp_store,LINUX;APPLE,,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dygraph_sharding_stage3_for_eager,,,350,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_graph_execution_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_communicator_half_async,,,120,DIST,test_runner.py,2,,FLAGS_communicator_send_queue_size=1;FLAGS_communicator_max_merge_var_num=1;http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL -test_fleet_graph_executor,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_graph_executor,,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_pipeline_parallel,,GPU,500,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_pipeline_parallel_with_virtual_stage,,GPU,500,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_localsgd_meta_optimizer,LINUX,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., 
+test_fleet_localsgd_meta_optimizer,LINUX,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_class_center_sample,,GPU,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL test_pipeline,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_utils,LINUX;APPLE,,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., @@ -27,38 +27,38 @@ test_dygraph_sharding_stage2,,,200,DIST,../../dist_test.sh,2,,http_proxy=;https_ test_parallel_dygraph_control_flow,,,350,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_lars_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_hybrid_parallel_inference_helper,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_rolemaker_new,,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_rolemaker_new,,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dist_mnist_gradient_merge,LINUX;WIN32,GPU;ROCM,360,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_recv_save_op,,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_communicator_sync,,,,DIST,test_runner.py,2,,FLAGS_communicator_send_queue_size=1;FLAGS_communicator_max_merge_var_num=1;http_proxy=;https_proxy=;PYTHONPATH=../.., +test_recv_save_op,,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_communicator_sync,,,,,test_runner.py,2,,FLAGS_communicator_send_queue_size=1;FLAGS_communicator_max_merge_var_num=1;http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_pipeline_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_gradient_merge_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_amp_init,LINUX;WIN32,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_gradient_merge_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_amp_init,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dygraph_sharding_optimizer_stage2,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_meta_optimizer_base,LINUX;WIN32,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_meta_optimizer_base,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_raw_program_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_sharding_parallel,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_tensor_parallel,,,200,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dygraph_group_sharded_api_for_eager,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_distributed_strategy,LINUX;WIN32,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_dgc_meta_optimizer,,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC +test_fleet_distributed_strategy,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., 
+test_fleet_dgc_meta_optimizer,,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC test_parallel_dygraph_unused_variables,,,350,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_lamb_meta_optimizer,LINUX,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_dgc_momentum_op,,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC +test_fleet_lamb_meta_optimizer,LINUX,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dgc_momentum_op,,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_DGC test_parallel_dygraph_no_sync_gradient_check,,,60,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_pipeline_meta_optimizer_with_recompute,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_hybrid_meta_optimizer,LINUX;WIN32,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_hybrid_meta_optimizer,LINUX;WIN32,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_qat,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_sparse_embedding,,GPU,200,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212 test_parallel_dygraph_sparse_embedding,,ROCM,200,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_amp_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_amp_meta_optimizer,,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_parallel_dygraph_sparse_embedding_over_height,,GPU,150,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212 test_parallel_dygraph_sparse_embedding_over_height,,ROCM,350,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_distributed_strategy,LINUX;APPLE,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_distributed_strategy,LINUX;APPLE,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_auto_parallel_parallelizer,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_recompute_meta_optimizer,LINUX;WIN32,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_recompute_meta_optimizer,LINUX;WIN32,GPU;XPU;ASCEND;ASCEND_CL,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dygraph_group_sharded_api,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_fleet_private_function,LINUX;WIN32,,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_private_function,LINUX;WIN32,,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_new_group,,GPU;XPU;ASCEND;ASCEND_CL,,DIST,test_new_group.sh,2,,http_proxy=;https_proxy=, test_c_comm_init_op,LINUX,GPU;XPU;ASCEND;ASCEND_CL,120,DIST,test_c_comm_init_op.sh,2,,http_proxy=;https_proxy=, test_ir_pass_pipeline,,,120,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., @@ -67,9 +67,9 @@ test_parallel_dygraph_se_resnext,,GPU;ROCM,200,DIST,../../dist_test.sh,2,,http_p 
test_parallel_dygraph_sync_batch_norm,,GPU;ROCM,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_imperative_auto_mixed_precision,,GPU;ROCM,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_imperative_auto_mixed_precision_for_eager,,GPU;ROCM,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_mixed_precision,,GPU;ROCM,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_dygraph_recompute,,GPU;ROCM,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., -test_dygraph_recompute_for_eager,,GPU;ROCM,,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_mixed_precision,,GPU;ROCM,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dygraph_recompute,,GPU;ROCM,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_dygraph_recompute_for_eager,,GPU;ROCM,,,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_dist_mnist_dgc_nccl,,,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL OR WITH_RCCL;WITH_DGC test_dist_se_resnext_dgc,,,,DIST,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../..,WITH_NCCL OR WITH_RCCL;WITH_DGC test_auto_checkpoint,LINUX,,200,EXCLUSIVE:NIGHTLY,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., diff --git a/tools/gen_ut_cmakelists.py b/tools/gen_ut_cmakelists.py index 14f8e37626e75..4bfc58f640423 100644 --- a/tools/gen_ut_cmakelists.py +++ b/tools/gen_ut_cmakelists.py @@ -452,7 +452,6 @@ def _parse_line(self, line, curdir): archs = _proccess_archs(archs) os_ = _process_os(os_) run_serial = _process_run_serial(run_serial) - run_type = _process_run_type(run_type) cmd = "" @@ -460,6 +459,7 @@ def _parse_line(self, line, curdir): cmd += f"if ({c})\n" if launcher[-3:] == ".sh": + run_type = _process_run_type(run_type) dist_ut_port = self.port_manager.process_dist_port_num(num_port) dist_ut_port = self.port_manager.gset_port(name, dist_ut_port) cmd += f'''if({archs} AND {os_}) @@ -475,6 +475,12 @@ def _parse_line(self, line, curdir): ''' run_type_str = "" else: + try: + run_type = _process_run_type(run_type) + except Exception as e: + assert ( + run_type.strip() == "" + ), f"{e}\nIf use test_runner.py, the run_type can be ''" cmd += f'''if({archs} AND {os_}) py_test_modules( {name} @@ -493,7 +499,11 @@ def _parse_line(self, line, curdir): run_serial_str = ( f' RUN_SERIAL {run_serial}' if len(run_serial) > 0 else '' ) - if len(time_out_str) > 0 or len(run_serial_str) > 0: + if ( + len(time_out_str) > 0 + or len(run_serial_str) > 0 + or len(run_type_str) > 0 + ): set_properties = f''' set_tests_properties({name} PROPERTIES{time_out_str}{run_serial_str}{run_type_str})''' else: From 2b6bccc51a430cc46db3f8269d93b0ce56fc2e7f Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Mon, 31 Oct 2022 09:49:56 +0800 Subject: [PATCH 28/91] Fix the problem of printing log (#47474) --- tools/handle_h_cu_file.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/handle_h_cu_file.py b/tools/handle_h_cu_file.py index 84fba9f28202f..7cd29b39aa783 100644 --- a/tools/handle_h_cu_file.py +++ b/tools/handle_h_cu_file.py @@ -90,10 +90,14 @@ def get_h_cu_file(file_path): dir_path = file_path[1] filename = file_path[2] ut = filename.replace('^', '').replace('$', '').replace('.log', '') - os.system( - "cat %s/%s | grep 'precise test map fileeee:'| uniq >> %s/build/ut_map/%s/related_%s.txt" - % (dir_path, filename, 
rootPath, ut, ut)
-    )
+    ut_path = "%s/build/ut_map/%s" % (rootPath, ut)
+    if os.path.exists(ut_path):
+        os.system(
+            "cat %s/%s | grep 'precise test map fileeee:'| uniq >> %s/build/ut_map/%s/related_%s.txt"
+            % (dir_path, filename, rootPath, ut, ut)
+        )
+    else:
+        print("%s has failed, directory does not exist" % ut)
 
 
 def doFun(file_path):

From 31b677bda2f4938806e54682cd65bd0c77b95f14 Mon Sep 17 00:00:00 2001
From: risemeup1 <62429225+risemeup1@users.noreply.github.com>
Date: Mon, 31 Oct 2022 09:56:51 +0800
Subject: [PATCH 29/91] apply new precise_card_test to coverage_ci (#47473)

---
 tools/get_pr_ut.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py
index 990a0298b0c62..8962428ba0cab 100644
--- a/tools/get_pr_ut.py
+++ b/tools/get_pr_ut.py
@@ -303,7 +303,7 @@ def get_pr_ut(self):
         file_ut_map = None
 
         ret = self.__urlretrieve(
-            'https://paddle-docker-tar.bj.bcebos.com/pre_test_tmp/ut_file_map.json',
+            'https://paddle-docker-tar.bj.bcebos.com/tmp_test/ut_file_map.json',
            'ut_file_map.json',
         )
         if not ret:
@@ -352,7 +352,7 @@ def get_pr_ut(self):
             if len(file_list) == 0:
                 ut_list.append('filterfiles_placeholder')
                 ret = self.__urlretrieve(
-                    'https://paddle-docker-tar.bj.bcebos.com/pre_test_tmp/prec_delta',
+                    'https://paddle-docker-tar.bj.bcebos.com/tmp_test/prec_delta',
                     'prec_delta',
                 )
                 if ret:
@@ -458,7 +458,7 @@ def get_pr_ut(self):
         else:
             if ut_list:
                 ret = self.__urlretrieve(
-                    'https://paddle-docker-tar.bj.bcebos.com/pre_test_tmp/prec_delta',
+                    'https://paddle-docker-tar.bj.bcebos.com/tmp_test/prec_delta',
                     'prec_delta',
                 )
                 if ret:

From 1e2a371cbb52087d9e2b3b9641d4840e114aa957 Mon Sep 17 00:00:00 2001
From: risemeup1 <62429225+risemeup1@users.noreply.github.com>
Date: Mon, 31 Oct 2022 09:58:37 +0800
Subject: [PATCH 30/91] repair log bugs that keeps printing warnings (#47467)

---
 .../contrib/slim/quantization/post_training_quantization.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index 73d1e31159a6f..c959fc29bc918 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -476,7 +476,7 @@ def quantize(self):
 
         self._reset_activation_persistable()
 
-        if self._algo is 'min_max':
+        if self._algo == 'min_max':
             self._save_input_threhold()
         else:
             self._update_program()

From 91096ae22ca91e8da1a90d652f2fa09f17ec7fe3 Mon Sep 17 00:00:00 2001
From: Wang Xin
Date: Mon, 31 Oct 2022 10:32:04 +0800
Subject: [PATCH 31/91] remove boost compiler flags in flags.cmake (#47468)

---
 cmake/flags.cmake                                     | 8 +-------
 paddle/fluid/operators/dgc_op.h                       | 2 +-
 paddle/fluid/operators/mlu/mlu_baseop.cc              | 6 +++---
 paddle/fluid/operators/optimizers/dgc_momentum_op.h   | 2 +-
 paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc | 2 +-
 paddle/phi/kernels/cpu/layer_norm_kernel.cc           | 2 +-
 paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h  | 2 +-
 paddle/phi/kernels/impl/logsumexp_kernel_impl.h       | 2 +-
 8 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 0267f251e490f..6a0e1704bfa5d 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -170,13 +170,7 @@ if(NOT WIN32)
   if(NOT APPLE)
     if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM))
       set(COMMON_FLAGS
-          ${COMMON_FLAGS}
-          -Wno-format-truncation # Warning in boost gcc 8.2 
-Wno-error=parentheses # Warning in boost gcc 8.2 - -Wno-error=catch-value # Warning in boost gcc 8.2 - -Wno-error=nonnull-compare # Warning in boost gcc 8.2 - -Wno-error=address # Warning in boost gcc 8.2 - -Wno-ignored-qualifiers # Warning in boost gcc 8.2 + ${COMMON_FLAGS} -Wno-ignored-qualifiers # Warning in Paddle-Lite -Wno-ignored-attributes # Warning in Eigen gcc 8.3 -Wno-parentheses # Warning in Eigen gcc 8.3 ) diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index 44121a9434c72..2757b41dd7c5c 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -68,7 +68,7 @@ class DGCOpKernel : public framework::OpKernel { // nranks auto nranks_tensor = ctx.Input("nranks"); - const int nranks = static_cast(*nranks_tensor->data()); + const int nranks = static_cast(*nranks_tensor->data()); PADDLE_ENFORCE_GT(nranks, 1, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 04e3063dd7087..d205bc2b2554d 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -1610,9 +1610,9 @@ MLURNNDesc::~MLURNNDesc() { const float alpha1_float, const float alpha2_float, const float beta_float) { - const int alpha1_int = static_cast(alpha1_float); - const int alpha2_int = static_cast(alpha2_float); - const int beta_int = static_cast(beta_float); + const int alpha1_int = static_cast(alpha1_float); + const int alpha2_int = static_cast(alpha2_float); + const int beta_int = static_cast(beta_float); const void* alpha1_ptr = static_cast(&alpha1_float); const void* alpha2_ptr = static_cast(&alpha2_float); diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.h b/paddle/fluid/operators/optimizers/dgc_momentum_op.h index 7cb3ed8e80efa..bf9c9ff1e96ba 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.h +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.h @@ -39,7 +39,7 @@ class DGCMomentumKernel : public framework::OpKernel { // nranks auto nranks_tensor = context.Input("nranks"); - const int nranks = static_cast(*nranks_tensor->data()); + const int nranks = static_cast(*nranks_tensor->data()); PADDLE_ENFORCE_GT( nranks, 1, diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc index 68615a44e97c8..e250b5585da06 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc @@ -33,7 +33,7 @@ class XPULogsumexpKernel : public framework::OpKernel { const auto& input_dim_size = input->dims().size(); // The dims has full dim, set the reduce_all is True - reduce_all |= (static_cast(axis.size()) == input_dim_size); + reduce_all |= (static_cast(axis.size()) == input_dim_size); const T* input_data = input->data(); T* output_data = output->mutable_data(context.GetPlace()); diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc index dbc3da0ca15ac..7061d4f0ad730 100644 --- a/paddle/phi/kernels/cpu/layer_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -135,7 +135,7 @@ void LayerNormKernel(const Context& dev_ctx, scale ? scale->data() : nullptr, bias ? 
bias->data() : nullptr, static_cast(left), - static_cast(epsilon), + static_cast(epsilon), right); #endif } diff --git a/paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h b/paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h index 7e5b5ca4f8d4e..b7c1b2f9969a0 100644 --- a/paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h @@ -61,7 +61,7 @@ void LogsumexpGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(in_grad); const auto input_dim_size = in.dims().size(); - reduce_all |= (static_cast(axis.size()) == input_dim_size); + reduce_all |= (static_cast(axis.size()) == input_dim_size); if (reduce_all) { auto x = phi::EigenVector::Flatten(in); diff --git a/paddle/phi/kernels/impl/logsumexp_kernel_impl.h b/paddle/phi/kernels/impl/logsumexp_kernel_impl.h index 30a118a1317ba..7f61accaafd03 100644 --- a/paddle/phi/kernels/impl/logsumexp_kernel_impl.h +++ b/paddle/phi/kernels/impl/logsumexp_kernel_impl.h @@ -71,7 +71,7 @@ void LogsumexpKernel(const Context& dev_ctx, const auto& input_dim_size = x.dims().size(); // The dims has full dim, set the reduce_all is True - reduce_all |= (static_cast(axis.size()) == input_dim_size); + reduce_all |= (static_cast(axis.size()) == input_dim_size); if (reduce_all) { // Flatten and reduce 1-D tensor From d4b68dafd3ca54b0e16d5b23261b849cdc42395f Mon Sep 17 00:00:00 2001 From: YangZhou <56786796+SmileGoat@users.noreply.github.com> Date: Mon, 31 Oct 2022 10:43:40 +0800 Subject: [PATCH 32/91] [audio] rm kaiser window in audio get_window function && rm audio utils (#47469) * rm kaiser window in audio window function * rm paddle audio utils which is redundant * rm kaiser in test_audio_functions.py --- python/paddle/audio/functional/window.py | 14 ++------------ python/paddle/audio/utils/__init__.py | 13 ------------- python/paddle/audio/utils/error.py | 21 --------------------- python/paddle/tests/test_audio_functions.py | 7 ------- 4 files changed, 2 insertions(+), 53 deletions(-) delete mode 100644 python/paddle/audio/utils/__init__.py delete mode 100644 python/paddle/audio/utils/error.py diff --git a/python/paddle/audio/functional/window.py b/python/paddle/audio/functional/window.py index 315d5a50a323f..472c56b87acf9 100644 --- a/python/paddle/audio/functional/window.py +++ b/python/paddle/audio/functional/window.py @@ -231,16 +231,6 @@ def _tukey( return _truncate(w, needs_trunc) -@window_function_register.register() -def _kaiser( - M: int, beta: float, sym: bool = True, dtype: str = 'float64' -) -> Tensor: - """Compute a Kaiser window. - The Kaiser window is a taper formed by using a Bessel function. - """ - raise NotImplementedError() - - @window_function_register.register() def _gaussian( M: int, std: float, sym: bool = True, dtype: str = 'float64' @@ -346,7 +336,7 @@ def get_window( """Return a window of a given length and type. Args: - window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. + window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. win_length (int): Number of samples. fftbins (bool, optional): If True, create a "periodic" window. 
Otherwise, create a "symmetric" window, for use in filter design. Defaults to True. dtype (str, optional): The data type of the return window. Defaults to 'float64'. @@ -363,7 +353,7 @@ def get_window( cosine_window = paddle.audio.functional.get_window('cosine', n_fft) std = 7 - gussian_window = paddle.audio.functional.get_window(('gaussian',std), n_fft) + gaussian_window = paddle.audio.functional.get_window(('gaussian',std), n_fft) """ sym = not fftbins diff --git a/python/paddle/audio/utils/__init__.py b/python/paddle/audio/utils/__init__.py deleted file mode 100644 index 55a55c3ed0a84..0000000000000 --- a/python/paddle/audio/utils/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/paddle/audio/utils/error.py b/python/paddle/audio/utils/error.py deleted file mode 100644 index 244340b99b5ec..0000000000000 --- a/python/paddle/audio/utils/error.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -__all__ = ['ParameterError'] - - -class ParameterError(Exception): - """Exception class for Parameter checking""" - - pass diff --git a/python/paddle/tests/test_audio_functions.py b/python/paddle/tests/test_audio_functions.py index 5542b4483b62c..80a99343d8020 100644 --- a/python/paddle/tests/test_audio_functions.py +++ b/python/paddle/tests/test_audio_functions.py @@ -178,13 +178,6 @@ def test_gaussian_window_and_exception(self, n_fft: int): np.testing.assert_array_almost_equal( window_scipy_exp, window_paddle_exp.numpy(), decimal=5 ) - try: - window_paddle = paddle.audio.functional.get_window( - ("kaiser", 1.0), self.n_fft - ) - except NotImplementedError: - pass - try: window_paddle = paddle.audio.functional.get_window("hann", -1) except ValueError: From 81b93ebbc3c6e772eda9623b79c9d04365b88100 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Mon, 31 Oct 2022 10:58:56 +0800 Subject: [PATCH 33/91] fix python module not found bug (#47438) * fix python module not found bug * delete unused cast,test=allcases --- paddle/scripts/paddle_build.sh | 1 - python/paddle/fluid/tests/unittests/__init__.py | 9 ++++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7042fa3dec661..e29fd6549ee02 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2857,7 +2857,6 @@ function parallel_test() { cd ${PADDLE_ROOT}/build pip install hypothesis pip install ${PADDLE_ROOT}/build/python/dist/*whl - cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/testsuite.py ${PADDLE_ROOT}/build/python cp -r ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/white_list ${PADDLE_ROOT}/build/python ut_total_startTime_s=`date +%s` diff --git a/python/paddle/fluid/tests/unittests/__init__.py b/python/paddle/fluid/tests/unittests/__init__.py index 126cbeb493887..3a0e2ace56fec 100644 --- a/python/paddle/fluid/tests/unittests/__init__.py +++ b/python/paddle/fluid/tests/unittests/__init__.py @@ -18,9 +18,8 @@ import os -if os.name == 'nt': - import sys +import sys - dirname, filename = os.path.split(os.path.abspath(__file__)) - sys.path.insert(0, dirname) - print(sys.path) +dirname, filename = os.path.split(os.path.abspath(__file__)) +sys.path.insert(0, dirname) +print(sys.path) From c8fc33798a1a28354b90d78feac181000b96451b Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Mon, 31 Oct 2022 11:01:16 +0800 Subject: [PATCH 34/91] [Zero-Dim] support input 0D Tensor for reduce_sum/reduce_mean (#47219) --- paddle/phi/infermeta/unary.cc | 9 ++- paddle/phi/kernels/funcs/broadcast_function.h | 8 +- paddle/phi/kernels/funcs/reduce_function.h | 8 ++ .../kernels/gpu/reduce_mean_grad_kernel.cu | 31 +++++--- .../phi/kernels/gpu/reduce_sum_grad_kernel.cu | 36 ++++----- paddle/phi/kernels/reduce_mean_kernel.cc | 3 + python/paddle/fluid/layers/nn.py | 3 - .../fluid/tests/unittests/test_mean_op.py | 15 ++++ .../fluid/tests/unittests/test_reduce_op.py | 15 ++++ .../tests/unittests/test_zero_dim_shape.py | 51 ++++++++++++ python/paddle/tensor/math.py | 78 ++++++------------- python/paddle/tensor/stat.py | 19 +---- 12 files changed, 160 insertions(+), 116 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 3c66523aefffe..150da6d59b9ff 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ 
-2685,7 +2685,7 @@ DDim ReduceInferDim(const MetaTensor& x, bool full_dim = true; std::set dims_set(formated_axis.begin(), formated_axis.end()); - for (int64_t i = 0; i < x.dims().size(); ++i) { + for (int64_t i = 0; i < x_rank; ++i) { if (dims_set.find(i) == dims_set.end()) { full_dim = false; break; @@ -2695,7 +2695,7 @@ DDim ReduceInferDim(const MetaTensor& x, std::vector out_dim_vector; if (keep_dim) { - for (int64_t i = 0; i < x.dims().size(); ++i) { + for (int64_t i = 0; i < x_rank; ++i) { if (reduce_all || dims_set.find(i) != dims_set.end()) { out_dim_vector.push_back(1); } else { @@ -2703,7 +2703,7 @@ DDim ReduceInferDim(const MetaTensor& x, } } } else { - for (int64_t i = 0; i < x.dims().size(); ++i) { + for (int64_t i = 0; i < x_rank; ++i) { if (reduce_all || dims_set.find(i) != dims_set.end()) { continue; } else { @@ -2711,7 +2711,7 @@ DDim ReduceInferDim(const MetaTensor& x, } } - if (out_dim_vector.size() == 0) { + if (x_rank > 0 && out_dim_vector.size() == 0) { out_dim_vector.push_back(1); } } @@ -3013,6 +3013,7 @@ void SetValueInferMeta(const MetaTensor& x, MetaTensor* out) { phi::errors::InvalidArgument( "The rank of input should be less than 7, but received %d.", in_dims.size())); + out->set_dims(in_dims); } void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) { diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 59c3df0fce5e5..22ed5b29d77bc 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -44,7 +44,7 @@ struct DimensionsTransform { int64_t in_idx = 0; if (in_dim.size() < dim_size) { DimVector tmp_dim(dim_size, 1); - do { + for (; in_idx < in_dim.size();) { if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { tmp_dim[axis] = in_dim[in_idx]; in_idx++; @@ -59,11 +59,11 @@ struct DimensionsTransform { out_dims[axis], in_dim[in_idx])); } - } while (in_idx < in_dim.size()); + } in_dim.resize(dim_size); std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); } else { - do { + for (; in_idx < dim_size;) { if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { in_idx++; } else { @@ -76,7 +76,7 @@ struct DimensionsTransform { out_dims[in_idx], in_dim[in_idx])); } - } while (in_idx < dim_size); + } } std::reverse(in_dim.begin(), in_dim.end()); } diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 92fe3885b42f0..9138fd85e65aa 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -1063,6 +1063,14 @@ void ReduceKernel(const KPDevice& dev_ctx, dev_ctx.Alloc(y); auto x_dim = phi::vectorize(x.dims()); + + if (x_dim.size() == 0) { + std::vector inputs = {&x}; + std::vector outputs = {y}; + funcs::ElementwiseKernel(dev_ctx, inputs, &outputs, transform); + return; + } + auto config = ReduceConfig(origin_reduce_dims, x_dim); config.Run(dev_ctx); int numel = x.numel(); diff --git a/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu index 7da2502a5eea7..40c317e1262c5 100644 --- a/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu @@ -16,8 +16,8 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/reduce_function.h" -#include "paddle/phi/kernels/gpu/reduce_grad.h" namespace phi { @@ -29,23 
+29,34 @@ void ReduceMeanGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + // get reduce_dim and reduce_num for reduce_mean_grad int dim_size = x.dims().size(); + if (dims.size() == 0) { + reduce_all = true; + } std::vector reduce_dims = funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); + + auto update_dims = vectorize(x.dims()); int reduce_num = 1; for (auto i : reduce_dims) { reduce_num *= (x.dims())[i]; + update_dims[i] = 1; } + + // make new tensor + DenseTensor new_out_grad(out_grad.dtype()); + new_out_grad.ShareDataWith(out_grad); + new_out_grad.Resize(phi::make_ddim(update_dims)); + + // call BroadcastKernel + dev_ctx.Alloc(x_grad, x.dtype()); + std::vector inputs = {&new_out_grad}; + std::vector outputs = {x_grad}; + using MPType = typename kps::details::MPTypeTrait::Type; - ReduceGradKernel>( - dev_ctx, - x, - out_grad, - dims.GetData(), - keep_dim, - reduce_all, - x_grad, - kps::DivideFunctor(reduce_num)); + funcs::BroadcastKernel( + dev_ctx, inputs, &outputs, 0, kps::DivideFunctor(reduce_num)); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu index 2230b4b8525b3..74209afe37467 100644 --- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu @@ -29,42 +29,32 @@ void ReduceSumGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { - using MPType = typename kps::details::MPTypeTrait::Type; - auto out_dtype = x.dtype(); - auto* in_x = &x; - auto* d_out = &out_grad; - auto* d_x = x_grad; - - // get reduce_dim and reduce_num for reduce_mean_grad - int dim_size = in_x->dims().size(); + // get reduce_dim for reduce_mean_grad + int dim_size = x.dims().size(); if (dims.size() == 0) { reduce_all = true; } std::vector reduce_dims = funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); - auto update_dims = vectorize(d_x->dims()); - int reduce_num = 1; + auto update_dims = vectorize(x.dims()); for (auto i : reduce_dims) { - reduce_num *= (in_x->dims())[i]; update_dims[i] = 1; } + // make new tensor - DenseTensor new_d_out(d_out->dtype()); - new_d_out.ShareDataWith(*d_out); - new_d_out.Resize(phi::make_ddim(update_dims)); + DenseTensor new_out_grad(out_grad.dtype()); + new_out_grad.ShareDataWith(out_grad); + new_out_grad.Resize(phi::make_ddim(update_dims)); - dev_ctx.Alloc(d_x, x.dtype()); - auto pt_out_dtype = x.dtype(); - auto pt_d_out = new_d_out; - auto pt_d_x = *d_x; - std::vector inputs = {&pt_d_out}; - std::vector outputs = {&pt_d_x}; + // call ReduceGrad + dev_ctx.Alloc(x_grad, x.dtype()); + using MPType = typename kps::details::MPTypeTrait::Type; phi::ReduceGrad>( dev_ctx, - &pt_d_out, - &pt_d_x, - pt_out_dtype, + &new_out_grad, + x_grad, + x.dtype(), kps::IdentityFunctor()); } diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc index 375172fdb3733..aa615a6bb1ef1 100644 --- a/paddle/phi/kernels/reduce_mean_kernel.cc +++ b/paddle/phi/kernels/reduce_mean_kernel.cc @@ -26,6 +26,9 @@ void MeanKernel(const Context& dev_ctx, bool keep_dim, DenseTensor* out) { bool reduce_all = false; + if (dims.size() == 0) { + reduce_all = true; + } MeanRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9f7cbb1141193..4a5dbe4a106c2 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5096,9 
+5096,6 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): fluid.layers.reduce_sum(y, dim=[0, 1]) # [16, 20] """ - if dim is not None and not isinstance(dim, list): - dim = [dim] - reduce_all, dim = _get_reduce_dim(dim, input) if in_dygraph_mode(): diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index ed9313b054696..68e88c9ba2a81 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -58,6 +58,21 @@ def test_checkout_grad(self): self.check_grad(['X'], 'Out', check_eager=True) +class TestMeanOp_ZeroDim(OpTest): + def setUp(self): + self.op_type = "mean" + self.python_api = paddle.mean + self.dtype = np.float64 + self.inputs = {'X': np.random.random([]).astype(self.dtype)} + self.outputs = {'Out': np.mean(self.inputs["X"])} + + def test_check_output(self): + self.check_output(check_eager=True) + + def test_checkout_grad(self): + self.check_grad(['X'], 'Out', check_eager=True) + + class TestMeanOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 8fa448a6927dd..bf0a968bdb1ff 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -37,6 +37,21 @@ def test_check_grad(self): self.check_grad(['X'], 'Out', check_eager=True) +class TestSumOp_ZeroDim(OpTest): + def setUp(self): + self.python_api = paddle.sum + self.op_type = "reduce_sum" + self.inputs = {'X': np.random.random([]).astype("float64")} + self.outputs = {'Out': self.inputs['X'].sum(axis=None)} + self.attrs = {'dim': [], 'reduce_all': True} + + def test_check_output(self): + self.check_output(check_eager=True) + + def test_check_grad(self): + self.check_grad(['X'], 'Out', check_eager=True) + + class TestSumOp_fp16(OpTest): def setUp(self): self.python_api = paddle.sum diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_shape.py b/python/paddle/fluid/tests/unittests/test_zero_dim_shape.py index df4fa96d4a36c..0cab423aa7b98 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_shape.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_shape.py @@ -17,6 +17,7 @@ import numpy as np import unittest + unary_api_list = [ paddle.nn.functional.elu, paddle.nn.functional.gelu, @@ -159,5 +160,55 @@ def test_static_unary(self): paddle.disable_static() +reduce_api_list = [ + paddle.sum, + paddle.mean, + paddle.nansum, + paddle.nanmean, +] + + +class TestReduceAPI(unittest.TestCase): + def test_dygraph(self): + paddle.disable_static() + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + for api in reduce_api_list: + x = paddle.rand([]) + x.stop_gradient = False + out = api(x, None) + out.backward() + + self.assertEqual(x.shape, []) + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + paddle.enable_static() + + def test_static(self): + paddle.enable_static() + for api in reduce_api_list: + main_prog = fluid.Program() + with fluid.program_guard(main_prog, fluid.Program()): + x = paddle.rand([]) + + x.stop_gradient = False + out = api(x, None) + fluid.backward.append_backward(out) + + # Test compile shape, grad is always [1] + self.assertEqual(x.shape, ()) + self.assertEqual(out.shape, ()) + + exe = fluid.Executor() + result = exe.run(main_prog, fetch_list=[x, out]) + + 
# Test runtime shape + self.assertEqual(result[0].shape, ()) + self.assertEqual(result[1].shape, ()) + + paddle.disable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 91388a6f99a02..34bc3b006b3d9 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1265,22 +1265,7 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): out8 = paddle.sum(x, axis=0) # [1, 1, 1, 1] out9 = paddle.sum(x, axis=1) # [4, 0] """ - if isinstance(axis, Variable): - reduce_all_flag = True if axis.shape[0] == len(x.shape) else False - else: - if axis is not None and not isinstance(axis, (list, tuple)): - axis = [axis] - - if not axis: - axis = [] - - if len(axis) == 0: - reduce_all_flag = True - else: - if len(axis) == len(x.shape): - reduce_all_flag = True - else: - reduce_all_flag = False + reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) dtype_flag = False if dtype is not None: @@ -1290,11 +1275,6 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): if in_dygraph_mode(): return _C_ops.sum(x, axis, dtype, keepdim) - if not isinstance(axis, Variable): - axis = axis if axis != None and axis != [] and axis != () else [0] - if utils._contain_var(axis): - axis = utils._convert_to_tensor_list(axis) - if _in_legacy_dygraph(): if dtype_flag: return _legacy_C_ops.reduce_sum( @@ -1304,7 +1284,7 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): 'keep_dim', keepdim, 'reduce_all', - reduce_all_flag, + reduce_all, 'in_dtype', x.dtype, 'out_dtype', @@ -1318,10 +1298,10 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): 'keep_dim', keepdim, 'reduce_all', - reduce_all_flag, + reduce_all, ) - attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all_flag} + attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all} if dtype_flag: attrs.update({'in_dtype': x.dtype, 'out_dtype': dtype}) @@ -2304,13 +2284,13 @@ def _check_input(x): return out -def _get_reduce_axis(axis): +def _get_reduce_axis(axis, x): """ Internal function for max, min, amax and amin. It computes the attribute reduce_all value based on axis. """ if axis is not None and not isinstance(axis, list): - if isinstance(axis, tuple): + if isinstance(axis, (tuple, range)): axis = list(axis) elif isinstance(axis, int): axis = [axis] @@ -2320,37 +2300,25 @@ def _get_reduce_axis(axis): type(axis) ) ) - reduce_all = True if axis == None or axis == [] else False - if axis == None: + if axis is None: axis = [] + if axis == [] or len(axis) == len(x.shape): + reduce_all = True + else: + reduce_all = False return reduce_all, axis -def _get_reduce_axis_with_tensor(axis): +def _get_reduce_axis_with_tensor(axis, x): if isinstance(axis, Variable): - return False, axis - return _get_reduce_axis(axis) - - -def _get_reduce_all_value(axis): - """ - Internal function for max, min, amax and amin. - It computes the attribute reduce_all value based on axis. 
- """ - if axis is not None and not isinstance(axis, list): - if isinstance(axis, tuple): - axis = list(axis) - elif isinstance(axis, int): - axis = [axis] + if axis.shape[0] == len(x.shape): + reduce_all = True else: - raise TypeError( - "The type of axis must be int, list or tuple, but received {}".format( - type(axis) - ) - ) - - reduce_all = True if axis == None or axis == [] else False - axis = axis if axis != None and axis != [] else [0] + reduce_all = False + else: + reduce_all, axis = _get_reduce_axis(axis, x) + if utils._contain_var(axis): + axis = utils._convert_to_tensor_list(axis) return reduce_all, axis @@ -2432,7 +2400,7 @@ def max(x, axis=None, keepdim=False, name=None): #[7., 8.], [[[0., 0.], [0., 0.]], [[0., 0.], [1., 1.]]] """ - reduce_all, axis = _get_reduce_axis_with_tensor(axis) + reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) if in_dygraph_mode(): return _C_ops.max(x, axis, keepdim) if _in_legacy_dygraph(): @@ -2534,7 +2502,7 @@ def min(x, axis=None, keepdim=False, name=None): #[1., 2.], [[[1., 1.], [0., 0.]], [[0., 0.], [0., 0.]]] """ - reduce_all, axis = _get_reduce_axis_with_tensor(axis) + reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) if in_dygraph_mode(): return _C_ops.min(x, axis, keepdim) @@ -2650,7 +2618,7 @@ def amax(x, axis=None, keepdim=False, name=None): #[0.9., 0.9], [[[0., 0.3333], [0.5, 0.3333]], [[0.5, 0.3333], [1., 1.]]] """ - reduce_all, axis = _get_reduce_axis(axis) + reduce_all, axis = _get_reduce_axis(axis, x) if in_dygraph_mode(): return _C_ops.amax(x, axis, keepdim) if _in_legacy_dygraph(): @@ -2764,7 +2732,7 @@ def amin(x, axis=None, keepdim=False, name=None): #[0.1., 0.1], [[[0., 0.3333], [0.5, 0.3333]], [[0.5, 0.3333], [1., 1.]]] """ - reduce_all, axis = _get_reduce_axis(axis) + reduce_all, axis = _get_reduce_axis(axis, x) if in_dygraph_mode(): return _C_ops.amin(x, axis, keepdim) elif _in_legacy_dygraph(): diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 371e3fafd057e..ad061673ab9f4 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -20,9 +20,9 @@ from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode from .search import where from ..fluid.data_feeder import check_type, check_variable_and_dtype -from ..fluid.layers import utils import paddle from paddle import _C_ops, _legacy_C_ops +from .math import _get_reduce_axis_with_tensor __all__ = [] @@ -80,22 +80,9 @@ def mean(x, axis=None, keepdim=False, name=None): # [ 8.5 12.5 16.5] """ - if isinstance(axis, Variable): - reduce_all = True if axis.shape[0] == len(x.shape) else False - else: - if isinstance(axis, int): - axis = [axis] - reduce_all = ( - True - if axis is None or len(axis) == 0 or len(axis) == len(x.shape) - else False - ) - if axis is None or len(axis) == 0: - axis = [0] + reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) if in_dygraph_mode(): - if reduce_all: - axis = list(range(len(x.shape))) return _C_ops.mean(x, axis, keepdim) if _in_legacy_dygraph(): return _legacy_C_ops.reduce_mean( @@ -122,8 +109,6 @@ def mean(x, axis=None, keepdim=False, name=None): helper = LayerHelper('mean', **locals()) - if not isinstance(axis, Variable) and utils._contain_var(axis): - axis = utils._convert_to_tensor_list(axis) attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all} out = helper.create_variable_for_type_inference(x.dtype) helper.append_op( From f5912d0c7ee3f73183e9801fd0bbcfe48a5d22e3 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Mon, 31 Oct 2022 11:15:50 +0800 
Subject: [PATCH 35/91] fix typos for `True` and `False` (#47477)

* fix typo `Fasle`/`Flase` -> `False`

* fix typo `Ture` -> `True`
---
 paddle/fluid/framework/naive_executor.h            |  2 +-
 paddle/fluid/memory/allocation/allocator_facade.cc |  2 +-
 paddle/fluid/operators/detection/yolo_box_op.cc    |  2 +-
 .../fluid/operators/fused/fused_dropout_helper.h   |  2 +-
 paddle/fluid/operators/select_op_helper.h          |  2 +-
 .../fluid/operators/tensorrt/tensorrt_engine_op.h  |  2 +-
 paddle/fluid/operators/unique_op.cc                |  2 +-
 python/paddle/fluid/contrib/sparsity/asp.py        |  2 +-
 .../dygraph/dygraph_to_static/convert_operators.py |  2 +-
 python/paddle/fluid/dygraph/nn.py                  |  2 +-
 python/paddle/fluid/layers/control_flow.py         |  6 +++---
 python/paddle/fluid/layers/nn.py                   |  2 +-
 python/paddle/nn/layer/norm.py                     | 14 +++++++-------
 python/paddle/sparse/nn/layer/norm.py              |  4 ++--
 python/paddle/vision/ops.py                        |  2 +-
 15 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index 02b2249dea569..8ca3f5997af46 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -53,7 +53,7 @@ class NaiveExecutor {
                bool with_feed_fetch_ops);
 
   // Create variables before head.
-  // Create parameters if persistable is ture, or create the temporary variables
+  // Create parameters if persistable is true, or create the temporary variables
   // instead.
   void CreateVariables(const ProgramDesc& desc,
                        int block_id,
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index fb60d9110af38..6bd08767871cc 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -238,7 +238,7 @@ class AllocatorFacadePrivate {
   // releate to non-default stream (i.e., the stream users pass in). The
   // default stream Allocator is built in the structure of
   // AllocatorFacadePrivate, while the non-default stream is build in a
-  // manner in GetAllocator function with 'create_if_not_found = ture'.
+  // manner in GetAllocator function with 'create_if_not_found = true'.
   // We make special treatment for the default stream for performance
   // reasons. Since most Alloc calls are for default stream in
   // application, treating it separately can avoid lots of overhead of
diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc
index 257347f663c68..fbf4b55dfe44e 100644
--- a/paddle/fluid/operators/detection/yolo_box_op.cc
+++ b/paddle/fluid/operators/detection/yolo_box_op.cc
@@ -237,7 +237,7 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
       .. math::
 
          score_{conf} = \begin{case}
-                            obj, \text{if } iou_aware == flase \\
+                            obj, \text{if } iou_aware == false \\
                             obj^{1 - iou_aware_factor} * iou^{iou_aware_factor}, \text{otherwise}
                         \end{case}
 
diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h
index 5d6dd1a5bbf81..3230854284062 100644
--- a/paddle/fluid/operators/fused/fused_dropout_helper.h
+++ b/paddle/fluid/operators/fused/fused_dropout_helper.h
@@ -28,7 +28,7 @@ namespace operators {
  * Support two Dropouts in the use senarieo.
  * This warpper can be used in FFN op.
  * The DropoutParam will be used in the fused_dropout_act_bias,
- * fused_residual_dropout_bias(pre_layer_norm=ture) or
+ * fused_residual_dropout_bias(pre_layer_norm=true) or
 * fused_layernorm_residual_dropout_bias(pre_layer_norm=false).
 */
struct DropoutParam {
diff --git a/paddle/fluid/operators/select_op_helper.h b/paddle/fluid/operators/select_op_helper.h
index 46ef90c1a9219..ffab83e4e74fa 100644
--- a/paddle/fluid/operators/select_op_helper.h
+++ b/paddle/fluid/operators/select_op_helper.h
@@ -37,7 +37,7 @@ inline int GetBranchNumber(const phi::DenseTensor &mask) {
   if (platform::is_cpu_place(mask.place())) {
     return mask.data<bool>()[0];
   }
-  // when platform::is_gpu_place(mask.place()) is ture
+  // when platform::is_gpu_place(mask.place()) is true
   std::unique_ptr<phi::DenseTensor> cpu_mask{new phi::DenseTensor()};
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   framework::TensorCopySync(mask, platform::CPUPlace(), cpu_mask.get());
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index eea337d93fb7e..8096acc0a821c 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -269,7 +269,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       if (param_names_.count(x)) continue;
       runtime_input_names_.emplace_back(x);
     }
-    // calibration_mode is ture represents we need to
+    // calibration_mode is true represents we need to
     // generate the calibration table data.
     calibration_mode_ =
         (enable_int8_ && calibration_data_.size() == 0 && use_calib_mode_);
diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc
index 4d772e50e6525..c99f60ca873b1 100644
--- a/paddle/fluid/operators/unique_op.cc
+++ b/paddle/fluid/operators/unique_op.cc
@@ -119,7 +119,7 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X",
              "Input tensor. It should be a 1-D tensor when Attr(is_sorted)"
-             " is fasle or a N-D tensor when Attr(is_sorted) is true.");
+             " is false or a N-D tensor when Attr(is_sorted) is true.");
     AddAttr<int>("dtype", "data type for output index");
     AddOutput("Out", "A unique subsequence for input tensor.");
     AddOutput("Index",
diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py
index d770bd36e3980..b10f3261324ec 100644
--- a/python/paddle/fluid/contrib/sparsity/asp.py
+++ b/python/paddle/fluid/contrib/sparsity/asp.py
@@ -322,7 +322,7 @@ def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True):
         m (int, optional): m of `n:m` sparse pattern. Default is 4.
         mask_algo (string, optional): The function name to generate spase mask. Default is `mask_1d`.
                                       The vaild inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'.
-        with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Default is True.
+        with_mask (bool, optional): To prune mask Variables related to parameters or not. True is pruning also, False is not. Default is True.
     Returns:
         dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable.
     Examples:
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
index 0fa48c4260c46..abf9c48828039 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
@@ -373,7 +373,7 @@ def _run_paddle_cond(
     pred, true_fn, false_fn, get_args, set_args, return_name_ids, push_pop_names
 ):
     """
-    Paddle cond API will evaluate both ture_fn and false_fn codes.
+ Paddle cond API will evaluate both true_fn and false_fn codes. """ helper = GetterSetterHelper( get_args, set_args, return_name_ids, push_pop_names diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 60202c2a6b105..f93a031f7bc13 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -1268,7 +1268,7 @@ def __init__( if param_attr == False or bias_attr == False: assert ( bias_attr == param_attr - ), "param_attr and bias_attr must be set to Fasle at the same time in InstanceNorm" + ), "param_attr and bias_attr must be set to False at the same time in InstanceNorm" self._epsilon = epsilon self._param_attr = param_attr self._bias_attr = bias_attr diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 18b2ec496499d..5b79e3b86fadf 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -207,7 +207,7 @@ def select_input_with_buildin_type(inputs, mask, name): inputs = [to_static_variable(false_var), to_static_variable(true_var)] warnings.warn( "Return results from different branches in cond are not same type: " - "false_var returned by fasle_fn is '{}' and true_var of true_fn is " + "false_var returned by false_fn is '{}' and true_var of true_fn is " "'{}'".format(type(false_var), type(true_var)) ) elif ( @@ -230,7 +230,7 @@ def create_var_if_not_undefined_var(a): else: raise TypeError( "Unsupported return type of true_fn and false_fn in cond: false_var " - "returned by fasle_fn is '{}' and true_var of true_fn is '{}'".format( + "returned by false_fn is '{}' and true_var of true_fn is '{}'".format( type(false_var), type(true_var) ) ) @@ -2835,7 +2835,7 @@ def false_func(): "true_fn returns non-None while false_fn returns None" ) - # Merge ture and false output if they are not None + # Merge true and false output if they are not None if return_names is None: is_dy2staic = False return_names = ["no name"] * len(_to_sequence_except_dict(true_output)) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4a5dbe4a106c2..525558cb77b79 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3678,7 +3678,7 @@ def instance_norm( if param_attr is False: assert ( bias_attr is False - ), "param_attr and bias_attr must be set to Fasle at the same time in instance_norm" + ), "param_attr and bias_attr must be set to False at the same time in instance_norm" helper = LayerHelper('instance_norm', **locals()) dtype = helper.input_dtype() diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 1fca251c57007..1b5784fbedff1 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -74,7 +74,7 @@ def __init__( if weight_attr == False or bias_attr == False: assert ( weight_attr == bias_attr - ), "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm" + ), "weight_attr and bias_attr must be set to False at the same time in InstanceNorm" self._epsilon = epsilon self._weight_attr = weight_attr self._bias_attr = bias_attr @@ -779,11 +779,11 @@ class BatchNorm1D(_BatchNormBase): momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as weight_attr. 
If it is set to Fasle, the weight is not learnable. + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. If the Initializer of the weight_attr is not set, the parameter is initialized with ones. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. + will create ParamAttr as bias_attr. If it is set to False, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC". Default "NCL". use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None. @@ -892,11 +892,11 @@ class BatchNorm2D(_BatchNormBase): momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable. + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. If the Initializer of the weight_attr is not set, the parameter is initialized with ones. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. + will create ParamAttr as bias_attr. If it is set to False, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None. @@ -978,11 +978,11 @@ class BatchNorm3D(_BatchNormBase): momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable. + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. If the Initializer of the weight_attr is not set, the parameter is initialized with ones. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. + will create ParamAttr as bias_attr. If it is set to False, the weight is not learnable. 
If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format, the data format can be "NCDHW" or "NDHWC. Default: NCDHW. use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None. diff --git a/python/paddle/sparse/nn/layer/norm.py b/python/paddle/sparse/nn/layer/norm.py index 8bbad41ef38c1..936e43a18faf9 100644 --- a/python/paddle/sparse/nn/layer/norm.py +++ b/python/paddle/sparse/nn/layer/norm.py @@ -61,11 +61,11 @@ class BatchNorm(paddle.nn.BatchNorm1D): epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable. + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. + will create ParamAttr as bias_attr. If it is set to False, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC". Default "NCL". use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None. diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index bdb8bc4ee983a..519ac1db4c681 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -347,7 +347,7 @@ def yolo_box( .. math:: score_{conf} = \begin{case} - obj, \text{if } iou_aware == flase \\ + obj, \text{if } iou_aware == false \\ obj^{1 - iou_aware_factor} * iou^{iou_aware_factor}, \text{otherwise} \end{case} From bb6356e884a24a5d583b9053be02e97d135eb865 Mon Sep 17 00:00:00 2001 From: Chenxiao Niu Date: Mon, 31 Oct 2022 11:32:27 +0800 Subject: [PATCH 36/91] [MLU] fix compile error & add mlu blacklist function. 
 (#47439)
---
 paddle/fluid/imperative/prepared_operator.cc        | 48 +++++++++++++++++++
 .../fluid/operators/strided_slice_op_mlu.cc         |  2 +-
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index c6cc9befbd846..d76e06bd4143e 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -149,6 +149,48 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
       kernel_signature_(std::move(kernel_signature)),
       phi_kernel_(phi_kernel) {}
 
+#ifdef PADDLE_WITH_MLU
+
+static void tokenize(const std::string& ops,
+                     char delim,
+                     std::unordered_set<std::string>* op_set) {
+  std::string::size_type beg = 0;
+  for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos;
+       ++end) {
+    op_set->insert(ops.substr(beg, end - beg));
+    beg = end + 1;
+  }
+
+  op_set->insert(ops.substr(beg));
+}
+
+static bool is_in_mlu_black_list(const std::string& op_name) {
+  static bool inited = false;
+  static std::unordered_set<std::string> mlu_black_list;
+  static std::mutex s_mtx;
+  if (!inited) {
+    std::lock_guard<std::mutex> guard(s_mtx);
+    if (!inited) {
+      if (std::getenv("MLU_BLACK_LIST") != nullptr) {
+        std::string ops(std::getenv("MLU_BLACK_LIST"));
+        tokenize(ops, ',', &mlu_black_list);
+      }
+      inited = true;
+      VLOG(3) << "MLU Black List: ";
+      for (auto iter = mlu_black_list.begin(); iter != mlu_black_list.end();
+           ++iter) {
+        VLOG(3) << *iter << " ";
+      }
+    }
+  }
+  if (mlu_black_list.find(op_name) != mlu_black_list.end()) {
+    return true;
+  }
+  return false;
+}
+
+#endif
+
 template <typename VarType>
 PreparedOp PrepareImpl(
     const NameVarMap<VarType>& ins,
@@ -212,6 +254,12 @@ PreparedOp PrepareImpl(
       paddle::platform::is_in_xpu_black_list(op.Type());
 #endif
 
+#ifdef PADDLE_WITH_MLU
+  if (is_in_mlu_black_list(op.Type())) {
+    expected_kernel_key.place_ = platform::CPUPlace();
+  }
+#endif
+
   bool has_phi_kernel = false;
   const auto* arg_map_fn = phi_op_utils_map.GetArgumentMappingFn(op.Type());
 
diff --git a/paddle/fluid/operators/strided_slice_op_mlu.cc b/paddle/fluid/operators/strided_slice_op_mlu.cc
index 5800c167b0158..6caf1ad5ad15f 100644
--- a/paddle/fluid/operators/strided_slice_op_mlu.cc
+++ b/paddle/fluid/operators/strided_slice_op_mlu.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
+using Tensor = phi::DenseTensor;
 using Variable = framework::Variable;
 using LoDTensorArray = framework::LoDTensorArray;
 using DDim = framework::DDim;

From 3b219e5ea393a65622750e705f978113a40b0f2e Mon Sep 17 00:00:00 2001
From: kangguangli
Date: Mon, 31 Oct 2022 11:53:55 +0800
Subject: [PATCH 37/91] [ControlFlow] replace executor in run method of
 control flow ops with standalone_executor (#45696)

* replace executor in conditional_block_op.run with standalone_executor
* add block_id as the argument of standalone executor's method run; add print for program
* fix scope bug about conditional block op
* fix bug: unnecessary return of fetch value
* fix typo
* fix: quantization will set variable persistable, and these variables must exist in global scope
* add interpretercore cache for conditional block op but not activate in default
* fix bug: local scope reuse for conditional block op
* reset scope when conditional block op runs
* fix typo
* fix typo and code style
* add build scope for conditional block op
* add skip for transfer_layout kernel
* refine code
* fix reset_scope
* fix reset_scope
* refine code
* refine code
* refine code
1.
remove flag use in conditional_block_op 2. pass execution_config to BuildOpFuncList instead of individual parameter * refine code * remove the use of FLAGS_control_flow_use_new_executor_cache * change FLAGS_control_flow_use_new_executor to false --- .../interpreter/execution_config.h | 1 + .../interpreter/interpreter_util.cc | 39 +++-- .../interpreter/interpreter_util.h | 5 +- .../framework/new_executor/interpretercore.cc | 69 +++++--- .../framework/new_executor/interpretercore.h | 11 +- .../new_executor/new_executor_defs.h | 1 + .../new_executor/standalone_executor.cc | 2 +- .../operators/controlflow/CMakeLists.txt | 4 +- .../controlflow/conditional_block_op.cc | 158 +++++++++++++++--- paddle/phi/kernels/transfer_layout_kernel.cc | 8 + 10 files changed, 232 insertions(+), 66 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.h b/paddle/fluid/framework/new_executor/interpreter/execution_config.h index ec7b48c65becd..3721766700af5 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.h +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.h @@ -26,6 +26,7 @@ namespace interpreter { struct ExecutionConfig { bool used_for_jit{false}; bool create_local_scope{true}; + bool used_for_control_flow_op{false}; size_t host_num_threads; size_t deivce_num_threads; diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index a2979efc4594d..ae646ed42dbcc 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/new_executor/interpreter/data_transfer.h" +#include "paddle/fluid/framework/new_executor/interpreter/execution_config.h" #include "paddle/fluid/memory/stats.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" @@ -227,7 +228,14 @@ void BuildVariableScope(const framework::BlockDesc& block, } if (var_desc->Persistable()) { - auto* ptr = inner_scope->Var(var_name); + // In principle, we should put all trainable parameters in global scope, + // which means the root of the scope tree. Some cases like quantization + // will look up these parameters in global scope. 
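+      // A note on the walk just below: it assumes Scope::parent() returns
+      // nullptr only at the root of the scope tree, so the loop stops at
+      // the global scope, which is where passes that only see the global
+      // scope (quantization among them) expect persistable parameters.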
+ const Scope* ancestor_scope = inner_scope; + while (ancestor_scope->parent()) { + ancestor_scope = ancestor_scope->parent(); + } + auto* ptr = const_cast(ancestor_scope)->Var(var_name); VLOG(3) << "Initialize Variable " << var_name; // NOTE(zhiqiu): if var exists in scope and the type is right, @@ -291,7 +299,7 @@ std::tuple BuildVariableMap( const VariableNameMap& var_name_map, VariableScope* var_scope, Scope* local_scope, - bool allow_var_not_in_program = false, + bool find_var_recursively = false, bool allow_var_not_in_scope = false) { VariableValueMap name2var; VariableIdMap name2id; @@ -301,8 +309,10 @@ std::tuple BuildVariableMap( vars.reserve(item.second.size()); for (auto& var_name : item.second) { + auto* var = local_scope->FindVar(var_name); + if (!var_scope->HasVar(var_name)) { - if (allow_var_not_in_program && local_scope->FindVar(var_name)) { + if (find_var_recursively && var) { VLOG(3) << "Add " << var_name << " to var_scope"; var_scope->AddVar(var_name, nullptr); } else if (allow_var_not_in_scope) { @@ -310,7 +320,6 @@ std::tuple BuildVariableMap( continue; } } - auto* var = local_scope->FindVar(var_name); auto var_id = var_scope->VarId(var_name); vars.push_back(var); ids.push_back(var_id); @@ -419,8 +428,8 @@ void BuildOpFuncList(const platform::Place& place, const std::set& skip_gc_vars, std::vector* vec_func_list, VariableScope* var_scope, - bool use_local_scope, - bool used_for_jit) { + const ExecutionConfig& execution_config, + bool use_local_scope) { Scope* local_scope = use_local_scope ? var_scope->GetMutableLocalScope() : var_scope->GetMutableScope(); std::vector> @@ -428,7 +437,7 @@ void BuildOpFuncList(const platform::Place& place, // Step 1: create all ops for current block. CreateAllOps(block, &ops_unique); - if (!used_for_jit) { + if (!execution_config.used_for_jit) { // If gc is enabled and block size > 1 const ProgramDesc& main_program = *block.Program(); operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( @@ -479,14 +488,18 @@ void BuildOpFuncList(const platform::Place& place, bool allow_var_not_in_program = ops_with_var_not_in_program.count(op_type); bool allow_var_not_in_scope = ops_with_var_not_in_scope.count(op_type); + // ops in the control flow block may not find its inputs or outputs + // in VariableScope of the sub-block, so we need search it in parent scope. 
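+    // Concretely, the BuildVariableMap calls below pass
+    // execution_config.used_for_control_flow_op as find_var_recursively, so
+    // a name absent from this sub-block's VariableScope but reachable via
+    // local_scope->FindVar (which searches enclosing scopes) is registered
+    // instead of being treated as an error.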
+ framework::VariableNameMap& input_name_map = op->Inputs(); VariableValueMap ins_map; VariableIdMap ins_name2id; - std::tie(ins_map, ins_name2id) = BuildVariableMap(input_name_map, - var_scope, - local_scope, - allow_var_not_in_program, - allow_var_not_in_scope); + std::tie(ins_map, ins_name2id) = BuildVariableMap( + input_name_map, + var_scope, + local_scope, + execution_config.used_for_control_flow_op || allow_var_not_in_program, + allow_var_not_in_scope); framework::VariableNameMap& output_name_map = op->Outputs(); VariableValueMap outs_map; @@ -495,7 +508,7 @@ void BuildOpFuncList(const platform::Place& place, BuildVariableMap(output_name_map, var_scope, local_scope, - /*allow_var_not_in_program=*/false, + execution_config.used_for_control_flow_op, allow_var_not_in_scope); // step 1: build OpFuncNode diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h index 7bd82a5dd5f74..52163c64f7ea8 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h @@ -24,6 +24,7 @@ #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/new_executor/interpreter/execution_config.h" #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" @@ -75,8 +76,8 @@ void BuildOpFuncList(const platform::Place& place, const std::set& skip_gc_vars, std::vector* vec_func_list, VariableScope* scope, - bool use_local_scope = true, - bool used_for_jit = false); + const ExecutionConfig& execution_config, + bool use_local_scope = true); void AddFetch(const std::vector& fetch_names, framework::BlockDesc* block); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 3eb4db03f7394..230e333458dd4 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -16,6 +16,8 @@ #include +#include "gflags/gflags.h" + #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" @@ -47,6 +49,9 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); +PADDLE_DEFINE_EXPORTED_bool(control_flow_use_new_executor, + false, + "Use new executor in control flow op"); DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); @@ -107,7 +112,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place, const BlockDesc& block, const std::set& skip_gc_vars, framework::Scope* scope, - bool used_for_jit) + bool used_for_jit, + bool used_for_control_flow_op) : place_(place), block_(block), execution_config_(place, block.OpSize()), @@ -119,8 +125,10 @@ InterpreterCore::InterpreterCore(const platform::Place& place, completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion); execution_config_.used_for_jit = used_for_jit; - execution_config_.create_local_scope = - !used_for_jit && FLAGS_new_executor_use_local_scope; + execution_config_.used_for_control_flow_op = used_for_control_flow_op; + 
execution_config_.create_local_scope = !used_for_jit && + FLAGS_new_executor_use_local_scope && + !used_for_control_flow_op; execution_config_.skip_gc_vars = skip_gc_vars; execution_config_.Log(/*log_level=*/8); @@ -224,7 +232,7 @@ paddle::framework::FetchList InterpreterCore::Run( } paddle::framework::FetchList InterpreterCore::Run( - const std::vector& feed_names) { + const std::vector& feed_names, bool need_fetch) { SetDeviceId(place_); #ifdef PADDLE_WITH_MKLDNN @@ -243,12 +251,12 @@ paddle::framework::FetchList InterpreterCore::Run( execution_config_.skip_gc_vars, &op_func_nodes, &var_scope_, - HasLocalScope(), - execution_config_.used_for_jit); - is_build_ = true; + execution_config_, + HasLocalScope()); SetFeedVarsInplaceSkip(feed_names); // convert vec func_list to graph Convert(&op_func_nodes); + is_build_ = true; } else { // For the program that only run once, it is no need to // create work_queue, so the async_work_queue_ is created @@ -281,7 +289,7 @@ paddle::framework::FetchList InterpreterCore::Run( Scope* inner_scope = HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope(); auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName); - if (fetch_var) { + if (fetch_var && need_fetch) { return std::move(*fetch_var->GetMutable()); } else { return {}; @@ -311,9 +319,18 @@ void InterpreterCore::reset_scope(Scope* new_scope) { var_scope_.SetScope(new_scope); auto& var_list = var_scope_.MutableVarList(); for (size_t i = 0; i < var_list.size(); i++) { - var_list[i] = new_scope->FindVar(var_scope_.GetNameById(i)); + const auto& var_name = var_scope_.GetNameById(i); + var_list[i] = new_scope->FindVar(var_name); } - for (size_t i = 0; i < vec_instruction_.size(); ++i) { + // The index should assured valid, cause the InterpreterCore may not be fully + // built, but was still cached and used. For example, see unit test + // `test_assert.py`, it may exit before `InterpreterCore::Convert`, but still + // was cached and used by later tests. + for (size_t i = 0; i < std::min(refs_.size(), var_list.size()); i++) { + refs_[i]->ResetVariable(var_list[i]); + } + + for (size_t i = 0; i < vec_instruction_.size(); i++) { BuildAndCacheInstructionCtx(&vec_instruction_[i]); } } @@ -540,6 +557,10 @@ void InterpreterCore::Convert( if (var_desc && ins.count(item.first) && !info.IsInArgBufferNeeded(var_desc->Name())) { continue; + } else if (!block_.HasVar(var_scope_.GetNameById(id))) { + VLOG(10) << "[gc_check_inputs] skip gc: " + << var_scope_.GetNameById(id); + continue; } gc_check_vars.insert(id); } @@ -661,9 +682,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(place)) { - // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable - // values, but only through special `float_status` to checks whether - // the operation is overflow. More about `float_status`, see: + // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the + // variable values, but only through special `float_status` to checks + // whether the operation is overflow. 
More about `float_status`, see: // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue if (FLAGS_check_nan_inf) { framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place); @@ -734,7 +755,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { } } - VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope_); + VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope); if (!instr_node.InplaceBackMap().empty()) { platform::RecordEvent inplaceback_event( @@ -965,9 +986,9 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { if (platform::is_gpu_place(place)) { memory::RecordStream(allocation, stream); } else if (platform::is_cuda_pinned_place(place)) { - // TODO(Ruibiao): Here should do something to make sure that the tensor is - // not freed until the H2D copies done. However, simplely launch a CUDA - // runtime callback to the H2D stream may lead a high performance + // TODO(Ruibiao): Here should do something to make sure that the tensor + // is not freed until the H2D copies done. However, simplely launch a + // CUDA runtime callback to the H2D stream may lead a high performance // overhead. As all the cases we meet in H2D are copies from CPUPlace at // present, we just log a WARNING here. A better design is required. LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous " @@ -984,8 +1005,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { * instr.GCCheckVars. * 2. The stream which initializes this tensor is different from the stream * which the instruction run in. - * 3. The tensor is the instruction's input, cause we assume that instruction - * will initialize all output tensors with its running stream. + * 3. The tensor is the instruction's input, cause we assume that + * instruction will initialize all output tensors with its running stream. * 4. In the OP function of this instruction, the tensor is an input of a * async CUDA kernel. * @@ -995,8 +1016,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { * initialized this tensor has less time overhead. Conversely, it may take * more time if we try to extract those cross-stream input vars from * instr.GCCheckVars. - * 2. Now the instruction has no idea of which vars involving async running in - * OP function, and thus we can not recognize condition 4. It should be + * 2. Now the instruction has no idea of which vars involving async running + * in OP function, and thus we can not recognize condition 4. It should be * supported later. 
*/ for (int var_id : instr.GCCheckVars()) { @@ -1099,12 +1120,12 @@ void InterpreterCore::Prepare(const std::vector& feed_names, execution_config_.skip_gc_vars, &op_func_nodes, &var_scope_, - HasLocalScope(), - execution_config_.used_for_jit); - is_build_ = true; + execution_config_, + HasLocalScope()); SetFeedVarsInplaceSkip(feed_names); // convert vec func_list to graph Convert(&op_func_nodes); + is_build_ = true; } // NOTE: Because feed_tensor will be GC after // paddle::framework::BuildOpFuncList, so we should diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 530c8e9d04b2b..ff89f5ed731de 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -34,6 +34,9 @@ #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/device_event.h" +DECLARE_bool(new_executor_use_local_scope); +DECLARE_bool(control_flow_use_new_executor); + namespace paddle { namespace framework { @@ -43,7 +46,8 @@ class InterpreterCore { const BlockDesc& block, const std::set& skip_gc_vars, Scope* scope, - bool used_for_jit = false); + bool used_for_jit = false, + bool used_for_control_flow_op = false); ~InterpreterCore(); @@ -55,7 +59,8 @@ class InterpreterCore { const std::vector& feed_names, const std::vector& feed_tensors); - paddle::framework::FetchList Run(const std::vector& feed_names); + paddle::framework::FetchList Run(const std::vector& feed_names, + bool need_fetch = true); void ShareWorkQueueFrom(std::shared_ptr src); @@ -67,6 +72,8 @@ class InterpreterCore { void reset_scope(Scope* new_scope); + const platform::Place& GetPlace() const { return place_; } + private: // build graph void Convert(std::vector* op_func_nodes); diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 7fced920a77f0..6f2287a896645 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -418,6 +418,7 @@ class VarRefInfo { dynamic_ref_ = static_ref_; } } + void ResetVariable(Variable* new_var) { var_ = new_var; } bool CheckAndDecrease() { return static_ref_ == 1 || (dynamic_ref_.fetch_sub(1) == 1); } diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 2fe686b808114..4cc3b5d0f23e3 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -28,8 +28,8 @@ paddle::framework::FetchList StandaloneExecutor::Run( const std::vector& fetch_names) { platform::RecordEvent record_event( "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1); - auto core = GetInterpreterCore(scope, prog_, feed_names, fetch_names, false); + VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core; return core->Run(feed_names); } diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 193c5c4505641..07d72297b2b70 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -8,7 +8,7 @@ register_operators(EXCLUDES conditional_block_op DEPS naive_executor) cc_library( conditional_block_op SRCS conditional_block_op.cc - DEPS executor) + DEPS standalone_executor executor) cc_library( op_variant SRCS op_variant.cc @@ -29,7 
+29,7 @@ cc_library( cc_test( conditional_block_op_test SRCS conditional_block_op_test.cc - DEPS conditional_block_op executor) + DEPS conditional_block_op standalone_executor executor) if(WITH_UNITY_BUILD) target_link_libraries(paddle_operators_controlflow_unity conditional_block_op) diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 6d32a341d29f7..d441a84bc6387 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/controlflow/conditional_block_op.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/fluid/operators/assign_op.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/phi/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_MKLDNN @@ -35,6 +37,45 @@ const char ConditionalOp::kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; using Executor = framework::Executor; using ExecutorPrepareContext = framework::ExecutorPrepareContext; +using InterpreterCore = framework::InterpreterCore; + +namespace details { +static void BuildScopeForConditionalBlockOp( + const paddle::framework::InterpreterCore &interpreter_core, + const paddle::framework::BlockDesc &block, + paddle::framework::Scope *scope) { + for (auto &var_desc : block.AllVars()) { + auto var_name = var_desc->Name(); + if (var_name == framework::kEmptyVarName) { + continue; + } + VLOG(5) << "[BuildScopeForConditionalBlockOp]" + << "start:" << var_name; + if (var_desc->Persistable()) { + VLOG(5) << "[BuildScopeForConditionalBlockOp]" + << "Don't process persistent: " << var_name; + } else { + auto *ptr = scope->Var(var_name); + InitializeVariable(ptr, var_desc->GetType()); + VLOG(5) << "[BuildScopeForConditionalBlockOp]" + << "Not Found locally and created: " << var_name; + } + } + + auto &data_transfer_added_vars = + interpreter_core.GetVariableScope()->DataTransferAddedVars(); + for (size_t i = 0; i < data_transfer_added_vars.size(); i++) { + auto *ptr = scope->Var(data_transfer_added_vars[i].first); + InitializeVariable(ptr, + static_cast( + data_transfer_added_vars[i].second)); + VLOG(10) << "[BuildScopeForConditionalBlockOp]" + << "Initialize Transfer Added Variable " + << data_transfer_added_vars[i].first; + } +} +} // namespace details + class ConditionalBlockOp : public ConditionalOp { public: ConditionalBlockOp(const std::string &type, @@ -71,9 +112,20 @@ class ConditionalBlockOp : public ConditionalOp { platform::errors::PreconditionNotMet( "Expect Scope variable to be set in conditional_block_op, but " "got a null Scope variable. Please set the Scope variable.")); + auto *scopes = scope_var->GetMutable>(); - scopes->resize(1); - scopes->front() = &scope.NewScope(); + + if (scopes->size() == 0 || !FLAGS_control_flow_use_new_executor) { + scopes->resize(1); + scopes->front() = &scope.NewScope(); + } + + // We need to know whether the scope we cached is still valid. + // If not, we need to create a new one. 
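+    // Validity test used below: if the parent scope has no kids left, every
+    // child scope created earlier, including the one cached in
+    // scopes->front(), has already been destroyed, so the cached pointer
+    // would dangle and must be replaced with a fresh NewScope().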
+ if (scope.kids().size() == 0) { + scopes->front() = &scope.NewScope(); + } + auto &cur_scope = *scopes->front(); #ifdef PADDLE_WITH_MKLDNN // (jczaja) Executor on being destroyed clears oneDNN cache and @@ -84,25 +136,56 @@ class ConditionalBlockOp : public ConditionalOp { auto *block = Attr("sub_block"); VLOG(3) << "Conditional block.idx = " << block->ID() << ", scope = " << &cur_scope; + auto &skip_vars = Attr>(ConditionalOp::kSkipEagerDeletionVars); - if (!exec || !platform::is_same_place(exec->GetPlace(), dev_place)) { - auto &pdesc = *block->Program(); - exec.reset(new Executor(dev_place)); - if (FLAGS_use_mkldnn) exec->EnableMKLDNN(pdesc); - ctx = exec->Prepare(pdesc, block->ID(), skip_vars, false); + + if (FLAGS_control_flow_use_new_executor) { + std::set skip_gc_vars(skip_vars.begin(), skip_vars.end()); + + if (!core || !platform::is_same_place(core->GetPlace(), dev_place)) { + VLOG(10) << "[interpreterCore cache]" << core.get(); + VLOG_IF(10, core) + << platform::is_same_place(core->GetPlace(), dev_place); + core.reset(new InterpreterCore(dev_place, + *block, + skip_gc_vars, + &cur_scope, + /* used_for_jit */ false, + /* used_for_control_flow_op */ true)); + VLOG(10) << "[interpreterCore cache]" + << "new created:" << core; + } else { + details::BuildScopeForConditionalBlockOp(*core, *block, &cur_scope); + core->reset_scope(&cur_scope); + } + + core->Run({}, false); + + } else { + if (!exec || !platform::is_same_place(exec->GetPlace(), dev_place)) { + auto &pdesc = *block->Program(); + exec.reset(new Executor(dev_place)); + if (FLAGS_use_mkldnn) exec->EnableMKLDNN(pdesc); + ctx = exec->Prepare(pdesc, block->ID(), skip_vars, false); #ifdef PADDLE_WITH_MKLDNN - platform::AttachPointerHashToMKLDNNKey(exec.get(), dev_place); - platform::RegisterModelLayout(ctx->ops_, dev_place); + platform::AttachPointerHashToMKLDNNKey(exec.get(), dev_place); + platform::RegisterModelLayout(ctx->ops_, dev_place); #endif + } + exec->RunPreparedContext(ctx.get(), + &cur_scope, + /* create_local_scope */ false, + /* create_vars */ true, + /* keep_kids */ true); } - exec->RunPreparedContext(ctx.get(), &cur_scope, false, true, true); } } private: mutable std::shared_ptr exec{nullptr}; mutable std::unique_ptr ctx{nullptr}; + mutable std::shared_ptr core{nullptr}; }; class ConditionalBlockInferShape : public framework::InferShapeBase { @@ -161,23 +244,51 @@ class ConditionalBlockGradOp : public ConditionalOp { platform::errors::InvalidArgument( "Expect Scope variable contains at least 1 scope, but got: %d", scopes.size())); - framework::Scope &cur_scope = *scopes[0]; + framework::Scope &cur_scope = *(scopes[0]); auto *block = Attr("sub_block"); - VLOG(3) << "Conditional Grad block.idx = " << block->ID() << ", scope = " << &cur_scope; - if (!exec || !platform::is_same_place(exec->GetPlace(), dev_place)) { - auto &pdesc = *block->Program(); - exec.reset(new Executor(dev_place)); - if (FLAGS_use_mkldnn) exec->EnableMKLDNN(pdesc); - ctx = exec->Prepare(pdesc, block->ID(), inside_grads, false); + + if (FLAGS_control_flow_use_new_executor) { + std::set skip_gc_vars(inside_grads.begin(), + inside_grads.end()); + + if (!core || !platform::is_same_place(core->GetPlace(), dev_place)) { + VLOG(10) << "[interpreterCore cache]" << core.get(); + VLOG_IF(10, core) + << platform::is_same_place(core->GetPlace(), dev_place); + core.reset(new InterpreterCore(dev_place, + *block, + skip_gc_vars, + &cur_scope, + /* used_for_jit */ false, + /* used_for_control_flow_op */ true)); + VLOG(10) << "[interpreterCore cache]" + << "new 
created:" << core; + } else { + details::BuildScopeForConditionalBlockOp(*core, *block, &cur_scope); + core->reset_scope(&cur_scope); + } + core->Run({}, false); + + } else { + if (!exec || !platform::is_same_place(exec->GetPlace(), dev_place)) { + auto &pdesc = *block->Program(); + exec.reset(new Executor(dev_place)); + if (FLAGS_use_mkldnn) exec->EnableMKLDNN(pdesc); + ctx = exec->Prepare(pdesc, block->ID(), inside_grads, false); #ifdef PADDLE_WITH_MKLDNN - platform::AttachPointerHashToMKLDNNKey(exec.get(), dev_place); - platform::RegisterModelLayout(ctx->ops_, dev_place); + platform::AttachPointerHashToMKLDNNKey(exec.get(), dev_place); + platform::RegisterModelLayout(ctx->ops_, dev_place); #endif + } + exec->RunPreparedContext(ctx.get(), + &cur_scope, + /* create_local_scope */ false, + /* create_vars */ true, + /* keep_kids */ true); } - exec->RunPreparedContext(ctx.get(), &cur_scope, false, true, false); AssignLocalGradientToParentScope( dev_place, cur_scope, scope, inside_grads, outside_grads, inputs); @@ -190,6 +301,7 @@ class ConditionalBlockGradOp : public ConditionalOp { private: mutable std::shared_ptr exec{nullptr}; mutable std::unique_ptr ctx{nullptr}; + mutable std::shared_ptr core{nullptr}; private: void AssignLocalGradientToParentScope( @@ -204,7 +316,8 @@ class ConditionalBlockGradOp : public ConditionalOp { for (size_t i = 0; i < outside_grads.size(); ++i) { const std::string &outside_grad_name = outside_grads[i]; const std::string &inside_grad_name = inside_grads[i]; - VLOG(4) << "inside_grad_name = " << inside_grad_name + VLOG(4) << "[assign local]" + << "inside_grad_name = " << inside_grad_name << ", outside_grad_name = " << outside_grad_name; framework::Variable *outside_var = parent_scope.FindVar(outside_grad_name); @@ -237,7 +350,8 @@ class ConditionalBlockGradOp : public ConditionalOp { for (size_t i = 0; i < outside_grads.size(); ++i) { const std::string &outside_grad_name = outside_grads[i]; const std::string &input_name = inputs[i]; - VLOG(4) << "input_name = " << input_name + VLOG(4) << "[assign zero]" + << "input_name = " << input_name << ", outside_grad_name = " << outside_grad_name; framework::Variable *input_var = scope.FindVar(input_name); if (input_var == nullptr) { diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index 0c44b3c5a71d8..f7177309006dd 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/data_layout_transform.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/memcpy_kernel.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/phi/backends/onednn/onednn_helper.h" #endif @@ -157,6 +158,13 @@ void TransferLayoutKernel(const Context& dev_ctx, VLOG(10) << "TransDataLayout from " << static_cast(src_layout) << " -> " << static_cast(dst_layout); + VLOG_IF(10, x.initialized()) << "TransDataLayout from " << x.layout(); + if (x.layout() == static_cast(dst_layout)) { + VLOG(10) << "No need to transform, already is " << x.layout(); + Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + return; + } + #ifdef PADDLE_WITH_MKLDNN TransferLayoutMKLDNN(dev_ctx, x, From b03b4a3c30be813cc26a1ea66c57dbae419583d9 Mon Sep 17 00:00:00 2001 From: Yulong Ao Date: Mon, 31 Oct 2022 12:54:38 +0800 Subject: [PATCH 38/91] [Auto Parallel] Improve the c++ dist attr (#47358) * [Auto Parallel] Improve the c++ dist attr * [Auto Parallel] Modify test_program.py * [Auto Parallel] Add the missiong import --- .../distributed/auto_parallel/dist_attr.cc | 339 +++++++++++++----- .../distributed/auto_parallel/dist_attr.h | 41 ++- .../auto_parallel/test/dist_attr_test.cc | 10 +- .../fluid/distributed/auto_parallel/utils.h | 2 +- paddle/fluid/framework/attribute.cc | 26 +- paddle/fluid/framework/attribute.h | 2 + paddle/fluid/framework/op_desc.cc | 8 + paddle/fluid/framework/var_desc.cc | 83 ++++- paddle/fluid/framework/var_desc.h | 8 +- paddle/fluid/pybind/auto_parallel_py.cc | 49 ++- .../distributed/auto_parallel/dist_context.py | 19 +- .../paddle/distributed/auto_parallel/utils.py | 117 ++++++ python/paddle/fluid/framework.py | 2 +- .../unittests/auto_parallel/CMakeLists.txt | 1 + .../auto_parallel/test_dist_attr_v2.py | 285 ++++++++++++++- .../auto_parallel/test_serialization.py | 287 +++++++++++++++ .../fluid/tests/unittests/test_program.py | 8 +- 17 files changed, 1151 insertions(+), 136 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_serialization.py diff --git a/paddle/fluid/distributed/auto_parallel/dist_attr.cc b/paddle/fluid/distributed/auto_parallel/dist_attr.cc index 53e28b7a904bf..57a5b40768af5 100644 --- a/paddle/fluid/distributed/auto_parallel/dist_attr.cc +++ b/paddle/fluid/distributed/auto_parallel/dist_attr.cc @@ -29,36 +29,57 @@ namespace auto_parallel { std::vector TensorDistAttr::fields_{ "process_mesh", "dims_mapping", "batch_dim", "dynamic_dims"}; -TensorDistAttr::TensorDistAttr(const VarDesc& tensor) - : tensor_(&tensor), batch_dim_(0) { +TensorDistAttr::TensorDistAttr(const VarDesc& tensor) : tensor_(&tensor) { + VLOG(4) << "[TensorDistAttr constructor] tensor name: " << tensor_->Name(); + if (tensor_->GetType() == framework::proto::VarType::READER) return; + if (tensor_->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY) return; + if (tensor_->GetType() == framework::proto::VarType::STEP_SCOPES) return; + tensor_shape_ = tensor_->GetShape(); + VLOG(4) << "[TensorDistAttr constructor] tensor shape: " + << str_join(tensor_shape_); set_default_dims_mapping(); - std::vector tensor_shape = tensor_->GetShape(); - for (std::size_t i = 0; i < tensor_shape.size(); ++i) { + for (std::size_t i = 0; i < tensor_shape_.size(); ++i) { dynamic_dims_.push_back(false); } } TensorDistAttr::TensorDistAttr(const TensorDistAttr& dist_attr) { if (tensor_ == nullptr) { - tensor_ = dist_attr.tensor(); + tensor_ = dist_attr.tensor_; + tensor_shape_ = 
From b03b4a3c30be813cc26a1ea66c57dbae419583d9 Mon Sep 17 00:00:00 2001
From: Yulong Ao
Date: Mon, 31 Oct 2022 12:54:38 +0800
Subject: [PATCH 38/91] [Auto Parallel] Improve the c++ dist attr (#47358)

* [Auto Parallel] Improve the c++ dist attr

* [Auto Parallel] Modify test_program.py

* [Auto Parallel] Add the missing import
---
 .../distributed/auto_parallel/dist_attr.cc    | 339 +++++++++++++-----
 .../distributed/auto_parallel/dist_attr.h     |  41 ++-
 .../auto_parallel/test/dist_attr_test.cc      |  10 +-
 .../fluid/distributed/auto_parallel/utils.h   |   2 +-
 paddle/fluid/framework/attribute.cc           |  26 +-
 paddle/fluid/framework/attribute.h            |   2 +
 paddle/fluid/framework/op_desc.cc             |   8 +
 paddle/fluid/framework/var_desc.cc            |  83 ++++-
 paddle/fluid/framework/var_desc.h             |   8 +-
 paddle/fluid/pybind/auto_parallel_py.cc       |  49 ++-
 .../distributed/auto_parallel/dist_context.py |  19 +-
 .../paddle/distributed/auto_parallel/utils.py | 117 ++++++
 python/paddle/fluid/framework.py              |   2 +-
 .../unittests/auto_parallel/CMakeLists.txt    |   1 +
 .../auto_parallel/test_dist_attr_v2.py        | 285 ++++++++++++++-
 .../auto_parallel/test_serialization.py       | 287 +++++++++++++++
 .../fluid/tests/unittests/test_program.py     |   8 +-
 17 files changed, 1151 insertions(+), 136 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_serialization.py

diff --git a/paddle/fluid/distributed/auto_parallel/dist_attr.cc b/paddle/fluid/distributed/auto_parallel/dist_attr.cc
index 53e28b7a904bf..57a5b40768af5 100644
--- a/paddle/fluid/distributed/auto_parallel/dist_attr.cc
+++ b/paddle/fluid/distributed/auto_parallel/dist_attr.cc
@@ -29,36 +29,57 @@ namespace auto_parallel {
 std::vector<std::string> TensorDistAttr::fields_{
     "process_mesh", "dims_mapping", "batch_dim", "dynamic_dims"};
 
-TensorDistAttr::TensorDistAttr(const VarDesc& tensor)
-    : tensor_(&tensor), batch_dim_(0) {
+TensorDistAttr::TensorDistAttr(const VarDesc& tensor) : tensor_(&tensor) {
+  VLOG(4) << "[TensorDistAttr constructor] tensor name: " << tensor_->Name();
+  if (tensor_->GetType() == framework::proto::VarType::READER) return;
+  if (tensor_->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY) return;
+  if (tensor_->GetType() == framework::proto::VarType::STEP_SCOPES) return;
+  tensor_shape_ = tensor_->GetShape();
+  VLOG(4) << "[TensorDistAttr constructor] tensor shape: "
+          << str_join(tensor_shape_);
   set_default_dims_mapping();
-  std::vector<int64_t> tensor_shape = tensor_->GetShape();
-  for (std::size_t i = 0; i < tensor_shape.size(); ++i) {
+  for (std::size_t i = 0; i < tensor_shape_.size(); ++i) {
     dynamic_dims_.push_back(false);
   }
 }
 
 TensorDistAttr::TensorDistAttr(const TensorDistAttr& dist_attr) {
   if (tensor_ == nullptr) {
-    tensor_ = dist_attr.tensor();
+    tensor_ = dist_attr.tensor_;
+    tensor_shape_ = dist_attr.tensor_shape_;
   }
-  set_process_mesh(dist_attr.process_mesh());
-  set_dims_mapping(dist_attr.dims_mapping());
-  set_batch_dim(dist_attr.batch_dim());
-  set_dynamic_dims(dist_attr.dynamic_dims());
-  set_annotated(dist_attr.annotated());
+  if (tensor_ != nullptr) {
+    VLOG(4) << "[TensorDistAttr copy constructor] tensor name: "
+            << tensor_->Name() << ", tensor shape: " << str_join(tensor_shape_);
+  } else {
+    VLOG(4) << "[TensorDistAttr copy constructor] tensor name: None"
+            << ", tensor shape: " << str_join(tensor_shape_);
+  }
+  copy_from(dist_attr);
 }
 
 TensorDistAttr& TensorDistAttr::operator=(const TensorDistAttr& dist_attr) {
   if (tensor_ == nullptr) {
-    tensor_ = dist_attr.tensor();
+    tensor_ = dist_attr.tensor_;
+    tensor_shape_ = dist_attr.tensor_shape_;
   }
+  if (tensor_ != nullptr) {
+    VLOG(4) << "[TensorDistAttr assign constructor] tensor name: "
+            << tensor_->Name() << ", tensor shape: " << str_join(tensor_shape_);
+  } else {
+    VLOG(4) << "[TensorDistAttr assign constructor] tensor name: None"
+            << ", tensor shape: " << str_join(tensor_shape_);
+  }
+  copy_from(dist_attr);
+  return *this;
+}
+
+void TensorDistAttr::copy_from(const TensorDistAttr& dist_attr) {
   set_process_mesh(dist_attr.process_mesh());
   set_dims_mapping(dist_attr.dims_mapping());
   set_batch_dim(dist_attr.batch_dim());
   set_dynamic_dims(dist_attr.dynamic_dims());
   set_annotated(dist_attr.annotated());
-  return *this;
 }
 
 void TensorDistAttr::set_process_mesh(const ProcessMesh& process_mesh) {
@@ -84,9 +105,9 @@ void TensorDistAttr::set_batch_dim(int64_t batch_dim) {
       true,
       platform::errors::InvalidArgument(
           "Wrong batch_dim %d in this distributed attribute.", batch_dim));
-  if (tensor_ != nullptr) {
-    std::vector<int64_t> tensor_shape = tensor_->GetShape();
-    int64_t canonical_batch_dim = canonical_dim(batch_dim, tensor_shape.size());
+  if (tensor_ != nullptr && tensor_shape_.size() > 0) {
+    int64_t canonical_batch_dim =
+        canonical_dim(batch_dim, tensor_shape_.size());
     batch_dim_ = canonical_batch_dim;
   } else {
     batch_dim_ = batch_dim;
@@ -113,8 +134,7 @@ void TensorDistAttr::set_annotated(
 
 void TensorDistAttr::set_default_dims_mapping() {
   if (tensor_ != nullptr) {
-    std::vector<int64_t> tensor_shape = tensor_->GetShape();
-    dims_mapping_ = std::vector<int64_t>(tensor_shape.size(), -1);
+    dims_mapping_ = std::vector<int64_t>(tensor_shape_.size(), -1);
   }
 }
 
@@ -127,6 +147,8 @@ void TensorDistAttr::annotate(const std::string& name) {
 
 bool TensorDistAttr::verify_process_mesh(
     const ProcessMesh& process_mesh) const {
+  VLOG(4) << "[TensorDistAttr verify_process_mesh] "
+          << process_mesh.to_string();
   if (!process_mesh_.empty()) {
     for (int64_t dim_mapping : dims_mapping_) {
       if (dim_mapping < -1 || dim_mapping >= process_mesh_.ndim()) {
@@ -139,11 +161,9 @@ bool TensorDistAttr::verify_dims_mapping(
     const std::vector<int64_t>& dims_mapping) const {
-  if (tensor_ != nullptr) {
-    std::vector<int64_t> tensor_shape = tensor_->GetShape();
-    if (dims_mapping.size() != tensor_shape.size()) {
-      return false;
-    }
+  VLOG(4) << "[TensorDistAttr verify_dims_mapping] " << str_join(dims_mapping);
+  if (dims_mapping.size() != tensor_shape_.size()) {
+    return false;
   }
   std::unordered_map<int64_t, int64_t> map;
   if (!process_mesh_.empty()) {
@@ -168,9 +188,9 @@ bool TensorDistAttr::verify_dims_mapping(
 }
 
 bool TensorDistAttr::verify_batch_dim(int64_t dim) const {
-  if (tensor_ != nullptr) {
-    std::vector<int64_t> tensor_shape = tensor_->GetShape();
-    int64_t ndim = tensor_shape.size();
+  VLOG(4) << "[TensorDistAttr verify_batch_dim] " << dim;
+  int64_t ndim = tensor_shape_.size();
+  if (tensor_ != nullptr && ndim > 0) {
     if (dim < 0) {
       dim = dim + ndim;
     }
@@ -183,17 +203,16 @@ bool TensorDistAttr::verify_batch_dim(int64_t dim) const {
 
 bool TensorDistAttr::verify_dynamic_dims(
     const std::vector<bool>& dynamic_dims) const {
-  if (tensor_ != nullptr) {
-    std::vector<int64_t> tensor_shape = tensor_->GetShape();
-    if (dynamic_dims.size() != tensor_shape.size()) {
-      return false;
-    }
+  VLOG(4) << "[TensorDistAttr verify_dynamic_dims] " << str_join(dynamic_dims);
+  if (dynamic_dims.size() != tensor_shape_.size()) {
+    return false;
   }
   return true;
 }
 
 bool TensorDistAttr::verify_annotated(
     const std::map<std::string, bool>& annotated) const {
+  VLOG(4) << "[TensorDistAttr verify_annotated] " << str_join(annotated);
   for (const auto& item : annotated) {
     auto result = std::find(std::begin(fields_), std::end(fields_), item.first);
     if (result == std::end(fields_)) {
@@ -204,9 +223,6 @@ bool TensorDistAttr::verify_annotated(
 }
 
 bool TensorDistAttr::verify() const {
-  if (tensor_ == nullptr) {
-    return false;
-  }
   if (!verify_process_mesh(process_mesh_)) {
    return false;
  }
@@ -240,19 +256,17 @@ std::string TensorDistAttr::to_string() const {
   return dist_str;
 }
 
-TensorDistAttr TensorDistAttr::from_proto(const TensorDistAttrProto& proto) {
-  TensorDistAttr dist_attr;
-  dist_attr.process_mesh_ = ProcessMesh::from_proto(proto.process_mesh());
-  dist_attr.dims_mapping_.resize(proto.dims_mapping_size());
+void TensorDistAttr::from_proto(const TensorDistAttrProto& proto) {
+  process_mesh_ = ProcessMesh::from_proto(proto.process_mesh());
+  dims_mapping_.resize(proto.dims_mapping_size());
   for (int64_t i = 0; i < proto.dims_mapping_size(); ++i) {
-    dist_attr.dims_mapping_[i] = proto.dims_mapping(i);
+    dims_mapping_[i] = proto.dims_mapping(i);
   }
-  dist_attr.batch_dim_ = proto.batch_dim();
-  dist_attr.dynamic_dims_.resize(proto.dynamic_dims_size());
+  batch_dim_ = proto.batch_dim();
+  dynamic_dims_.resize(proto.dynamic_dims_size());
   for (int64_t i = 0; i < proto.dynamic_dims_size(); ++i) {
-    dist_attr.dynamic_dims_[i] = proto.dynamic_dims(i);
+    dynamic_dims_[i] = proto.dynamic_dims(i);
   }
-  return dist_attr;
 }
 
 TensorDistAttrProto TensorDistAttr::to_proto() const {
@@ -268,6 +282,26 @@ TensorDistAttrProto TensorDistAttr::to_proto() const {
   return proto;
 }
 
+std::string TensorDistAttr::serialize_to_string() {
+  std::string data;
+  auto proto = to_proto();
+  proto.SerializeToString(&data);
+  PADDLE_ENFORCE_EQ(to_proto().SerializeToString(&data),
+                    true,
+                    platform::errors::InvalidArgument(
+                        "Failed to serialize tensor dist attr to string."));
+  return data;
+}
+
+void TensorDistAttr::parse_from_string(const std::string& data) {
+  TensorDistAttrProto proto;
+  PADDLE_ENFORCE_EQ(proto.ParseFromString(data),
+                    true,
+                    platform::errors::InvalidArgument(
+                        "Failed to parse tensor dist attr from string."));
+  from_proto(proto);
+}
+
 bool operator==(const TensorDistAttr& lhs, const TensorDistAttr& rhs) {
   if (lhs.process_mesh() != rhs.process_mesh()) {
     return false;
@@ -288,52 +322,103 @@ std::vector<std::string> OperatorDistAttr::fields_{
     "process_mesh", "impl_type", "impl_idx"};
 
 OperatorDistAttr::OperatorDistAttr(const OpDesc& op) : op_(&op) {
+  VLOG(4) << "[OperatorDistAttr constructor] op type: " << op_->Type();
+  initialize();
+}
+
+OperatorDistAttr::OperatorDistAttr(const OperatorDistAttr& dist_attr) {
+  if (op_ == nullptr) {
+    op_ = dist_attr.op();
+  }
+  if (op_ != nullptr) {
+    VLOG(4) << "[OperatorDistAttr copy constructor] op type: " << op_->Type();
+  } else {
+    VLOG(4) << "[OperatorDistAttr copy constructor] op type: None";
+  }
+  initialize();
+  copy_from(dist_attr);
+}
+
+OperatorDistAttr& OperatorDistAttr::operator=(
+    const OperatorDistAttr& dist_attr) {
+  if (op_ == nullptr) {
+    op_ = dist_attr.op();
+  }
+  if (op_ != nullptr) {
+    VLOG(4) << "[OperatorDistAttr assign constructor] op type: " << op_->Type();
+  } else {
+    VLOG(4) << "[OperatorDistAttr assign constructor] op type: None";
+  }
+  initialize();
+  copy_from(dist_attr);
+  return *this;
+}
+
+void OperatorDistAttr::initialize() {
+  if (op_ == nullptr) return;
   for (std::string name : op_->InputArgumentNames()) {
     VarDesc* input = op_->Block()->FindVarRecursive(name);
+    VLOG(4) << "[OperatorDistAttr create input dist attr] " << name;
     inputs_[name] = input;
-    input_dist_attrs_[name] = TensorDistAttr(*input);
+    if (input == nullptr || op_->Type() == "create_py_reader") {
+      input_dist_attrs_[name] = TensorDistAttr();
+    } else {
+      input_dist_attrs_[name] = TensorDistAttr(*input);
+    }
   }
   for (std::string name : op_->OutputArgumentNames()) {
     VarDesc* output = op_->Block()->FindVarRecursive(name);
+    VLOG(4) << "[OperatorDistAttr create output dist attr] " << name;
     outputs_[name] = output;
-    output_dist_attrs_[name] = TensorDistAttr(*output);
+    if (output == nullptr) {
+      output_dist_attrs_[name] = TensorDistAttr();
+    } else {
+      output_dist_attrs_[name] = TensorDistAttr(*output);
+    }
   }
   impl_type_ = "default";
   impl_idx_ = 0;
 }
 
-OperatorDistAttr::OperatorDistAttr(const OperatorDistAttr& dist_attr) {
-  if (op_ == nullptr) {
-    op_ = dist_attr.op();
-  }
-  for (const auto& item : dist_attr.input_dist_attrs()) {
-    set_input_dist_attr(item.first, item.second);
-  }
-  for (const auto& item : dist_attr.output_dist_attrs()) {
-    set_output_dist_attr(item.first, item.second);
-  }
+void OperatorDistAttr::copy_from(const OperatorDistAttr& dist_attr) {
+  set_input_dist_attrs(dist_attr.input_dist_attrs());
+  set_output_dist_attrs(dist_attr.output_dist_attrs());
   set_process_mesh(dist_attr.process_mesh());
   set_impl_type(dist_attr.impl_type());
   set_impl_idx(dist_attr.impl_idx());
   set_annotated(dist_attr.annotated());
+  impl_type_ = dist_attr.impl_type();
+  impl_idx_ = dist_attr.impl_idx();
 }
 
-OperatorDistAttr& OperatorDistAttr::operator=(
-    const OperatorDistAttr& dist_attr) {
+void OperatorDistAttr::set_input_dist_attrs(
+    const std::map<std::string, TensorDistAttr>& dist_attrs) {
   if (op_ == nullptr) {
-    op_ = dist_attr.op();
-  }
-  for (const auto& item : dist_attr.input_dist_attrs()) {
-    set_input_dist_attr(item.first, item.second);
+    for (const auto& item : dist_attrs) {
+      set_input_dist_attr(item.first, item.second);
+    }
+  } else {
+    for (const auto& item : input_dist_attrs_) {
+      if (dist_attrs.count(item.first) == 1) {
+        set_input_dist_attr(item.first, dist_attrs.at(item.first));
+      }
+    }
   }
-  for (const auto& item : dist_attr.output_dist_attrs()) {
-    set_output_dist_attr(item.first, item.second);
+}
+
+void OperatorDistAttr::set_output_dist_attrs(
+    const std::map<std::string, TensorDistAttr>& dist_attrs) {
+  if (op_ == nullptr) {
+    for (const auto& item : dist_attrs) {
+      set_output_dist_attr(item.first, item.second);
+    }
+  } else {
+    for (const auto& item : output_dist_attrs_) {
+      if (dist_attrs.count(item.first) == 1) {
+        set_output_dist_attr(item.first, dist_attrs.at(item.first));
+      }
    }
  }
-  set_process_mesh(dist_attr.process_mesh());
-  set_impl_type(dist_attr.impl_type());
-  set_impl_idx(dist_attr.impl_idx());
-  set_annotated(dist_attr.annotated());
-  return *this;
 }
 
 void OperatorDistAttr::set_input_dist_attr(const std::string& name,
   PADDLE_ENFORCE_EQ(
       verify_input_dist_attr(name, dist_attr),
       true,
-      platform::errors::InvalidArgument(
-          "Wrong dist_attr %s for %s.", dist_attr.to_string(), name));
+      platform::errors::InvalidArgument("Wrong dist_attr %s for %s. %s",
+                                        dist_attr.to_string(),
+                                        name,
+                                        to_string()));
   input_dist_attrs_[name] = dist_attr;
   // Make sure the process mesh of input be same as that of the op
   input_dist_attrs_[name].set_process_mesh(process_mesh_);
@@ -394,8 +481,30 @@ void OperatorDistAttr::set_annotated(
   annotated_ = annotated;
 }
 
+const std::vector<int64_t>& OperatorDistAttr::input_dims_mapping(
+    const std::string& name) const {
+  return input_dist_attr(name).dims_mapping();
+}
+
+void OperatorDistAttr::set_input_dims_mapping(
+    const std::string& name, const std::vector<int64_t>& dims_mapping) {
+  input_dist_attr(name).set_dims_mapping(dims_mapping);
+}
+
+const std::vector<int64_t>& OperatorDistAttr::output_dims_mapping(
+    const std::string& name) {
+  return output_dist_attr(name).dims_mapping();
+}
+
+void OperatorDistAttr::set_output_dims_mapping(
+    const std::string& name, const std::vector<int64_t>& dims_mapping) {
+  output_dist_attr(name).set_dims_mapping(dims_mapping);
+}
+
 bool OperatorDistAttr::verify_input_dist_attr(
     const std::string& name, const TensorDistAttr& dist_attr) const {
+  VLOG(4) << "[OperatorDistAttr verify_input_dist_attr] " << name << " "
+          << dist_attr.to_string();
   if (!dist_attr.verify()) {
     return false;
   }
@@ -414,6 +523,8 @@ bool OperatorDistAttr::verify_input_dist_attr(
 
 bool OperatorDistAttr::verify_output_dist_attr(
     const std::string& name, const TensorDistAttr& dist_attr) const {
+  VLOG(4) << "[OperatorDistAttr verify_output_dist_attr] " << name << " "
+          << dist_attr.to_string();
   if (!dist_attr.verify()) {
     return false;
   }
@@ -432,6 +543,8 @@ bool OperatorDistAttr::verify_output_dist_attr(
 
 bool OperatorDistAttr::verify_process_mesh(
     const ProcessMesh& process_mesh) const {
+  VLOG(4) << "[OperatorDistAttr verify_process_mesh] "
+          << process_mesh.to_string();
   if (process_mesh != process_mesh_) {
     return false;
   }
@@ -450,6 +563,7 @@ bool OperatorDistAttr::verify_process_mesh(
 
 bool OperatorDistAttr::verify_annotated(
     const std::map<std::string, bool>& annotated) const {
+  VLOG(4) << "[OperatorDistAttr verify_annotated] " << str_join(annotated);
   for (const auto& item : annotated) {
     auto result = std::find(std::begin(fields_), std::end(fields_), item.first);
     if (result == std::end(fields_)) {
@@ -457,11 +571,15 @@ bool OperatorDistAttr::verify_annotated(
     }
   }
   for (auto& item : input_dist_attrs_) {
+    VLOG(4) << "[OperatorDistAttr verify_annotated input] "
+            << str_join(item.second.annotated());
     if (!item.second.verify_annotated(item.second.annotated())) {
       return false;
     }
   }
   for (auto& item : output_dist_attrs_) {
+    VLOG(4) << "[OperatorDistAttr verify_annotated output] "
+            << str_join(item.second.annotated());
     if (!item.second.verify_annotated(item.second.annotated())) {
       return false;
     }
@@ -501,6 +619,44 @@ bool OperatorDistAttr::verify() const {
   return true;
 }
 
+void OperatorDistAttr::rename_input(const std::string& old_name,
+                                    const std::string& new_name) {
+  for (auto& item : input_dist_attrs_) {
+    if (item.first == old_name) {
+      VarDesc* new_input = op_->Block()->FindVarRecursive(new_name);
+      inputs_[new_name] = new_input;
+      if (new_input == nullptr) {
+        input_dist_attrs_[new_name] = TensorDistAttr();
+      } else {
+        input_dist_attrs_[new_name] = TensorDistAttr(*new_input);
+        input_dist_attrs_[new_name].copy_from(input_dist_attrs_[old_name]);
+      }
+      inputs_.erase(old_name);
+      input_dist_attrs_.erase(old_name);
+      break;
+    }
+  }
+}
+
+void OperatorDistAttr::rename_output(const std::string& old_name,
+                                     const std::string& new_name) {
+  for (auto& item : output_dist_attrs_) {
+    if (item.first == old_name) {
+      VarDesc* new_output = op_->Block()->FindVarRecursive(new_name);
+      outputs_[new_name] = new_output;
+      if (new_output == nullptr) {
+        output_dist_attrs_[new_name] = TensorDistAttr();
+      } else {
+        output_dist_attrs_[new_name] = TensorDistAttr(*new_output);
+        output_dist_attrs_[new_name].copy_from(output_dist_attrs_[old_name]);
+      }
+      outputs_.erase(old_name);
+      output_dist_attrs_.erase(old_name);
+      break;
+    }
+  }
+}
+
 std::string OperatorDistAttr::to_string() const {
   std::string str;
   if (op_ != nullptr) {
@@ -525,23 +681,22 @@ std::string OperatorDistAttr::to_string() const {
   return str;
 }
 
-OperatorDistAttr OperatorDistAttr::from_proto(
-    const OperatorDistAttrProto& proto) {
-  OperatorDistAttr dist_attr;
+void OperatorDistAttr::from_proto(const OperatorDistAttrProto& proto) {
   for (int64_t i = 0; i < proto.input_dist_attrs_size(); ++i) {
-    dist_attr.input_dist_attrs_[proto.input_dist_attrs(i).name()] =
-        TensorDistAttr::from_proto(
-            proto.input_dist_attrs(i).tensor_dist_attr());
+    TensorDistAttr dist_attr;
+    std::string name = proto.input_dist_attrs(i).name();
+    dist_attr.from_proto(proto.input_dist_attrs(i).tensor_dist_attr());
+    input_dist_attrs_[name] = dist_attr;
   }
   for (int64_t i = 0; i < proto.output_dist_attrs_size(); ++i) {
-    dist_attr.output_dist_attrs_[proto.output_dist_attrs(i).name()] =
-        TensorDistAttr::from_proto(
-            proto.output_dist_attrs(i).tensor_dist_attr());
+    TensorDistAttr dist_attr;
+    std::string name = proto.output_dist_attrs(i).name();
+    dist_attr.from_proto(proto.output_dist_attrs(i).tensor_dist_attr());
+    output_dist_attrs_[name] = dist_attr;
  }
-  dist_attr.process_mesh_ = ProcessMesh::from_proto(proto.process_mesh());
-  dist_attr.impl_type_ = proto.impl_type();
-  dist_attr.impl_idx_ = proto.impl_idx();
-  return dist_attr;
+  process_mesh_ = ProcessMesh::from_proto(proto.process_mesh());
+  impl_type_ = proto.impl_type();
+  impl_idx_ = proto.impl_idx();
 }
 
 OperatorDistAttrProto OperatorDistAttr::to_proto() const {
@@ -562,6 +717,26 @@ OperatorDistAttrProto OperatorDistAttr::to_proto() const {
   return proto;
 }
 
+std::string OperatorDistAttr::serialize_to_string() {
+  std::string data;
+  auto proto = to_proto();
+  proto.SerializeToString(&data);
+  PADDLE_ENFORCE_EQ(to_proto().SerializeToString(&data),
+                    true,
+                    platform::errors::InvalidArgument(
+                        "Failed to serialize op dist attr to string."));
+  return data;
+}
+
+void OperatorDistAttr::parse_from_string(const std::string& data) {
+  OperatorDistAttrProto proto;
+  PADDLE_ENFORCE_EQ(proto.ParseFromString(data),
+                    true,
+                    platform::errors::InvalidArgument(
+                        "Failed to parse op dist attr from string."));
+  from_proto(proto);
+}
+
 bool operator==(const OperatorDistAttr& lhs, const OperatorDistAttr& rhs) {
   if (lhs.process_mesh() != rhs.process_mesh()) {
     return false;
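The new serialize_to_string()/parse_from_string() pair above makes a dist attr round-trippable through its protobuf form. A hedged sketch of the intended usage (the namespace and the non-const serialize_to_string() signature are taken from this patch; the surrounding setup is assumed):

#include <string>

using paddle::distributed::auto_parallel::TensorDistAttr;

// dist attr -> bytes -> dist attr; dst ends up equal to src.
std::string RoundTrip(TensorDistAttr& src, TensorDistAttr* dst) {
  // serialize_to_string() wraps to_proto().SerializeToString();
  // parse_from_string() wraps ParseFromString() + from_proto().
  std::string bytes = src.serialize_to_string();
  dst->parse_from_string(bytes);
  return bytes;
}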
diff --git a/paddle/fluid/distributed/auto_parallel/dist_attr.h b/paddle/fluid/distributed/auto_parallel/dist_attr.h
index 44262f5bace92..d4aa306e71273 100644
--- a/paddle/fluid/distributed/auto_parallel/dist_attr.h
+++ b/paddle/fluid/distributed/auto_parallel/dist_attr.h
@@ -56,6 +56,8 @@ class TensorDistAttr {
 
   TensorDistAttr& operator=(const TensorDistAttr& dist_attr);
 
+  void copy_from(const TensorDistAttr& dist_attr);
+
   const VarDesc* tensor() const { return tensor_; }
 
   const ProcessMesh& process_mesh() const { return process_mesh_; }
@@ -101,16 +103,21 @@ class TensorDistAttr {
   // TensorDistAttr from_string(const std::string& dist_str);
   std::string to_string() const;
 
-  static TensorDistAttr from_proto(const TensorDistAttrProto& proto);
+  void from_proto(const TensorDistAttrProto& proto);
 
   TensorDistAttrProto to_proto() const;
 
+  std::string serialize_to_string();
+
+  void parse_from_string(const std::string& data);
+
  private:
   static std::vector<std::string> fields_;
   const VarDesc* tensor_{nullptr};
+  std::vector<int64_t> tensor_shape_;
   ProcessMesh process_mesh_;
   std::vector<int64_t> dims_mapping_;
-  int64_t batch_dim_;
+  int64_t batch_dim_{0};
   std::vector<bool> dynamic_dims_;
   std::map<std::string, bool> annotated_;
 };
@@ -136,6 +143,10 @@ class OperatorDistAttr {
 
   OperatorDistAttr& operator=(const OperatorDistAttr& dist_attr);
 
+  void initialize();
+
+  void copy_from(const OperatorDistAttr& dist_attr);
+
   const OpDesc* op() const { return op_; }
 
   const VarDesc& input(const std::string& name) const {
@@ -150,10 +161,16 @@ class OperatorDistAttr {
     return input_dist_attrs_;
   }
 
+  void set_input_dist_attrs(
+      const std::map<std::string, TensorDistAttr>& dist_attrs);
+
   const std::map<std::string, TensorDistAttr>& output_dist_attrs() const {
     return output_dist_attrs_;
   }
 
+  void set_output_dist_attrs(
+      const std::map<std::string, TensorDistAttr>& dist_attrs);
+
   const TensorDistAttr& input_dist_attr(const std::string& name) const {
     return input_dist_attrs_.at(name);
   }
@@ -198,6 +215,16 @@ class OperatorDistAttr {
 
   void annotate(const std::string& name);
 
+  const std::vector<int64_t>& input_dims_mapping(const std::string& name) const;
+
+  void set_input_dims_mapping(const std::string& name,
+                              const std::vector<int64_t>& dims_mapping);
+
+  const std::vector<int64_t>& output_dims_mapping(const std::string& name);
+
+  void set_output_dims_mapping(const std::string& name,
+                               const std::vector<int64_t>& dims_mapping);
+
   bool verify_input_dist_attr(const std::string& name,
                               const TensorDistAttr& dist_attr) const;
 
@@ -210,13 +237,21 @@ class OperatorDistAttr {
 
   bool verify() const;
 
+  void rename_input(const std::string& old_name, const std::string& new_name);
+
+  void rename_output(const std::string& old_name, const std::string& new_name);
+
   // OperatorDistAttr from_string(const std::string& dist_str);
   std::string to_string() const;
 
-  static OperatorDistAttr from_proto(const OperatorDistAttrProto& proto);
+  void from_proto(const OperatorDistAttrProto& proto);
 
   OperatorDistAttrProto to_proto() const;
 
+  std::string serialize_to_string();
+
+  void parse_from_string(const std::string& data);
+
  private:
   static std::vector<std::string> fields_;
   const OpDesc* op_{nullptr};
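The per-argument dims_mapping accessors declared above shorten a common two-step pattern. A small sketch of how they compose with the older per-tensor API (op_dist_attr is assumed to be an already initialized OperatorDistAttr):

#include <vector>

using paddle::distributed::auto_parallel::OperatorDistAttr;

void SetAndReadDimsMapping(OperatorDistAttr* op_dist_attr) {
  // One call per argument replaces fetching the tensor dist attr first.
  op_dist_attr->set_input_dims_mapping("X", std::vector<int64_t>{0, -1});
  const std::vector<int64_t>& dm = op_dist_attr->input_dims_mapping("X");
  (void)dm;  // dm == {0, -1}
}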
diff --git a/paddle/fluid/distributed/auto_parallel/test/dist_attr_test.cc b/paddle/fluid/distributed/auto_parallel/test/dist_attr_test.cc
index e2f035584c1c3..d313decee6f24 100644
--- a/paddle/fluid/distributed/auto_parallel/test/dist_attr_test.cc
+++ b/paddle/fluid/distributed/auto_parallel/test/dist_attr_test.cc
@@ -81,10 +81,9 @@ TEST(DistAttr, ctor) {
   x_sstream << x_dist_attr;
   EXPECT_EQ(x_sstream.str(), x_dist_attr.to_string());
   auto x_proto = x_dist_attr.to_proto();
-  TensorDistAttr new_x_dist_attr = TensorDistAttr::from_proto(x_proto);
+  TensorDistAttr new_x_dist_attr(*x);
+  new_x_dist_attr.from_proto(x_proto);
   EXPECT_EQ(x_dist_attr, new_x_dist_attr);
-  // new_x_dist_attr is not valid since it does not bind to an var_desc
-  EXPECT_EQ(new_x_dist_attr.verify(), false);
 
   y_dist_attr.set_process_mesh(process_mesh);
   y_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, 0}));
@@ -139,10 +138,9 @@ TEST(DistAttr, ctor) {
   mul_sstream << mul_dist_attr;
   EXPECT_EQ(mul_sstream.str(), mul_dist_attr.to_string());
   auto mul_proto = mul_dist_attr.to_proto();
-  OperatorDistAttr new_mul_dist_attr = OperatorDistAttr::from_proto(mul_proto);
+  OperatorDistAttr new_mul_dist_attr(*op);
+  new_mul_dist_attr.from_proto(mul_proto);
   EXPECT_EQ(mul_dist_attr, new_mul_dist_attr);
-  // new_mul_dist_attr is not valid since it does not bind to an op_desc
-  EXPECT_EQ(new_mul_dist_attr.verify(), false);
 }
 
 }  // namespace auto_parallel
diff --git a/paddle/fluid/distributed/auto_parallel/utils.h b/paddle/fluid/distributed/auto_parallel/utils.h
index de4162730b19c..58f9425f5329f 100644
--- a/paddle/fluid/distributed/auto_parallel/utils.h
+++ b/paddle/fluid/distributed/auto_parallel/utils.h
@@ -82,7 +82,7 @@ inline std::string str_join(std::map<std::string, bool> const& elements,
   for (const auto& item : elements) {
     str += item.first + ": " + std::to_string(item.second) + ",";
   }
-  return str.substr(0, str.size() - 2);
+  return str.substr(0, str.size() - 1);
 }
 
 // Refer to https://stackoverflow.com/a/46931770
diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc
index dd456b147ac8d..2c34f9a928f12 100644
--- a/paddle/fluid/framework/attribute.cc
+++ b/paddle/fluid/framework/attribute.cc
@@ -119,8 +119,30 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
     }
 
     default:
-      PADDLE_THROW(platform::errors::Unavailable("Unsupport attribute type %d.",
-                                                 attr_desc.type()));
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Unsupported attribute type %d.", attr_desc.type()));
+  }
+  return paddle::blank();
+}
+
+Attribute GetAttrValue(const proto::VarDesc::Attr& attr_desc) {
+  switch (attr_desc.type()) {
+    case proto::AttrType::INT: {
+      return attr_desc.i();
+    }
+    case proto::AttrType::STRING: {
+      return attr_desc.s();
+    }
+    case proto::AttrType::INTS: {
+      std::vector<int> val(attr_desc.ints_size());
+      for (int i = 0; i < attr_desc.ints_size(); ++i) {
+        val[i] = attr_desc.ints(i);
+      }
+      return val;
+    }
+    default:
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Unsupported attribute type %d.", attr_desc.type()));
   }
   return paddle::blank();
 }
diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h
index a82e8e7e76831..e1eba710c2780 100644
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
@@ -37,6 +37,8 @@ paddle::any GetAttrValue(const Attribute& attr);
 
 Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc);
 
+Attribute GetAttrValue(const proto::VarDesc::Attr& attr_desc);
+
 template <typename T>
 struct ExtractAttribute {
   explicit ExtractAttribute(const std::string& attr_name)
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 0c783f7de448a..321230e86064b 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -885,6 +885,10 @@ void OpDesc::RenameOutput(const std::string &old_name,
     std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
   }
 
+  if (dist_attr_) {
+    dist_attr_->rename_output(old_name, new_name);
+  }
+
   need_update_ = true;
 }
 
@@ -900,6 +904,10 @@ void OpDesc::RenameInput(const std::string &old_name,
     std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
   }
 
+  if (dist_attr_) {
+    dist_attr_->rename_input(old_name, new_name);
+  }
+
   need_update_ = true;
 }
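With the two OpDesc hunks above, renaming an op argument keeps its per-argument dist attr in sync instead of orphaning it under the old name. A sketch of the effect; MutableDistAttr() as an OpDesc accessor is an assumption here, and the surrounding setup is omitted:

void RenameKeepsDistAttr(paddle::framework::OpDesc* op) {
  // Give the input "x" a dims mapping under its current name.
  op->MutableDistAttr()->set_input_dims_mapping("x", {0, -1});
  // RenameInput() now also calls dist_attr_->rename_input(...), so the
  // mapping travels with the argument to its new name.
  op->RenameInput("x", "x_renamed");
  auto dm = op->MutableDistAttr()->input_dims_mapping("x_renamed");  // {0, -1}
  (void)dm;
}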
*/ #include "paddle/fluid/framework/var_desc.h" #include "glog/logging.h" +#include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/enforce.h" @@ -28,6 +29,16 @@ VarDesc::VarDesc(const VarDesc &other) if (other.dist_attr_) { dist_attr_.reset(new TensorDistAttr(*other.dist_attr_)); } + need_updated_ = true; +} + +VarDesc::VarDesc(const proto::VarDesc &desc) : desc_(desc) { + // Restore attrs_ for auto parallel + for (const proto::VarDesc::Attr &attr : desc_.attrs()) { + std::string attr_name = attr.name(); + attrs_[attr_name] = GetAttrValue(attr); + } + need_updated_ = true; } proto::VarType::Type VarDesc::GetType() const { return desc_.type().type(); } @@ -348,14 +359,15 @@ void VarDesc::SetAttr(const std::string &name, const Attribute &v) { bool valid = attr_type == proto::AttrType::INT || attr_type == proto::AttrType::STRING || attr_type == proto::AttrType::INTS; - PADDLE_ENFORCE_EQ( - valid, - true, - platform::errors::InvalidArgument("The value for attr (%s) must be " - "one of list or int or string.", - name)); + PADDLE_ENFORCE_EQ(valid, + true, + platform::errors::InvalidArgument( + "The value for attr (%s) must be " + "one of int, string, list of int for now.", + name)); this->attrs_[name] = v; + need_updated_ = true; } Attribute VarDesc::GetAttr(const std::string &name) const { @@ -367,6 +379,63 @@ Attribute VarDesc::GetAttr(const std::string &name) const { return it->second; } +struct SetVarAttrDescVisitor { + explicit SetVarAttrDescVisitor(proto::VarDesc::Attr *attr) : attr_(attr) {} + mutable proto::VarDesc::Attr *attr_; + + template + void operator()(T &&v) { + using U = std::decay_t; + if (std::is_same::value) { + set_attr_value(v); + } else if (std::is_same::value) { + set_attr_value(v); + } else if (std::is_same>::value) { + set_attr_value(v); + } else { + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported calling method of SetAttrDescVisitor object.")); + } + } + + // This template is used to pass the compilation + template + void set_attr_value(U v); + + void set_attr_value(int v) { attr_->set_i(v); } + + void set_attr_value(const std::string &v) { attr_->set_s(v); } + + void set_attr_value(const std::vector &v) { + VectorToRepeated(v, attr_->mutable_ints()); + } +}; + +// Only need to flush the attrs for auto parallel for now +void VarDesc::Flush() { + VLOG(4) << "Flush " + << " " << Name() << " " << need_updated_; + if (need_updated_) { + this->desc_.mutable_attrs()->Clear(); + std::vector> sorted_attrs{attrs_.begin(), + attrs_.end()}; + std::sort( + sorted_attrs.begin(), + sorted_attrs.end(), + [](std::pair a, + std::pair b) { return a.first < b.first; }); + for (auto &attr : sorted_attrs) { + auto *attr_desc = desc_.add_attrs(); + attr_desc->set_name(attr.first); + attr_desc->set_type( + static_cast(attr.second.index() - 1)); + SetVarAttrDescVisitor visitor(attr_desc); + paddle::visit(visitor, attr.second); + } + need_updated_ = false; + } +} + TensorDistAttr *VarDesc::MutableDistAttr() { // If dist_attr_ is nullptr, construct a new one and return. 
diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h
index 63cbcb93420fe..ab60dfa56bd23 100644
--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
@@ -71,9 +71,7 @@ class VarDesc {
     need_updated_ = true;
   }
 
-  explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {
-    // need_updated_ = true;
-  }
+  explicit VarDesc(const proto::VarDesc &desc);
 
   // Explicitly implement the copy constructor for auto parallel
   VarDesc(const VarDesc &other);
@@ -90,7 +88,7 @@ class VarDesc {
   }
 
   proto::VarDesc *Proto() {
-    need_updated_ = true;
+    Flush();  // Only flush attrs for auto parallel
     return &desc_;
   }
 
@@ -194,6 +192,8 @@ class VarDesc {
   bool NeedUpdate() const { return need_updated_; }
   void SetNeedUpdate(bool need) { need_updated_ = need; }
 
+  void Flush();
+
   // The following methods are only used for auto parallel.
   uint64_t Id() const { return id_; }
   uint64_t OriginalId() const { return original_id_; }
diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc
index 12cd79b264dd4..1e0bda0c9401f 100644
--- a/paddle/fluid/pybind/auto_parallel_py.cc
+++ b/paddle/fluid/pybind/auto_parallel_py.cc
@@ -165,6 +165,12 @@ void BindAutoParallel(py::module *m) {
                        &DeviceMesh::dim_size))
       .def(py::self == py::self)
       .def(py::self != py::self)
+      .def(
+          "__deepcopy__",
+          [](const TensorDistAttr &self, py::dict) {
+            return TensorDistAttr(self);
+          },
+          py::arg("memo"))
       .def("__str__", &DeviceMesh::to_string);
 
   py::class_<TensorDistAttr>(*m, "TensorDistAttr")
@@ -182,9 +188,17 @@ void BindAutoParallel(py::module *m) {
       .def_property("dynamic_dims",
                     &TensorDistAttr::dynamic_dims,
                     &TensorDistAttr::set_dynamic_dims)
+      .def_property("annotated",
+                    &TensorDistAttr::annotated,
+                    &TensorDistAttr::set_annotated)
       .def("is_annotated", &TensorDistAttr::is_annotated)
       .def("annotate", &TensorDistAttr::annotate)
       .def("verify", &TensorDistAttr::verify)
+      .def("serialize_to_string",
+           [](TensorDistAttr &self) {
+             return py::bytes(self.serialize_to_string());
+           })
+      .def("parse_from_string", &TensorDistAttr::parse_from_string)
       .def(py::self == py::self)
       .def(py::self != py::self)
       .def("__str__", &TensorDistAttr::to_string);
@@ -201,20 +215,23 @@ void BindAutoParallel(py::module *m) {
       .def_property("impl_idx",
                     &OperatorDistAttr::impl_idx,
                     &OperatorDistAttr::set_impl_idx)
+      .def_property("annotated",
+                    &OperatorDistAttr::annotated,
+                    &OperatorDistAttr::set_annotated)
+      .def_property("inputs_dist_attrs",
+                    &OperatorDistAttr::input_dist_attrs,
+                    &OperatorDistAttr::set_input_dist_attrs)
+      .def_property("outputs_dist_attrs",
+                    &OperatorDistAttr::output_dist_attrs,
+                    &OperatorDistAttr::set_output_dist_attrs)
       .def("input", &OperatorDistAttr::input)
       .def("output", &OperatorDistAttr::output)
-      .def("input_dist_attrs",
-           &OperatorDistAttr::input_dist_attrs,
-           py::return_value_policy::reference)
-      .def("output_dist_attrs",
-           &OperatorDistAttr::output_dist_attrs,
-           py::return_value_policy::reference)
-      .def("input_dist_attr",
+      .def("get_input_dist_attr",
           static_cast<TensorDistAttr& (OperatorDistAttr::*)(const std::string&)>(
               &OperatorDistAttr::input_dist_attr),
           py::return_value_policy::reference)
-      .def("output_dist_attr",
+      .def("get_output_dist_attr",
           static_cast<TensorDistAttr& (OperatorDistAttr::*)(const std::string&)>(
               &OperatorDistAttr::output_dist_attr),
           py::return_value_policy::reference)
       .def("set_input_dist_attr", &OperatorDistAttr::set_input_dist_attr)
       .def("set_output_dist_attr", &OperatorDistAttr::set_output_dist_attr)
       .def("is_annotated", &OperatorDistAttr::is_annotated)
       .def("annotate", &OperatorDistAttr::annotate)
+      .def("get_input_dims_mapping", &OperatorDistAttr::input_dims_mapping)
+      .def("set_input_dims_mapping", &OperatorDistAttr::set_input_dims_mapping)
+      .def("get_output_dims_mapping", &OperatorDistAttr::output_dims_mapping)
+      .def("set_output_dims_mapping",
+           &OperatorDistAttr::set_output_dims_mapping)
       .def("verify", &OperatorDistAttr::verify)
+      .def("serialize_to_string",
+           [](OperatorDistAttr &self) {
+             return py::bytes(self.serialize_to_string());
+           })
+      .def("parse_from_string", &OperatorDistAttr::parse_from_string)
       .def(py::self == py::self)
       .def(py::self != py::self)
+      .def(
+          "__deepcopy__",
+          [](const OperatorDistAttr &self, py::dict) {
+            return OperatorDistAttr(self);
+          },
+          py::arg("memo"))
       .def("__str__", &OperatorDistAttr::to_string);
 }
diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py
index 12c224ac27fa5..4b4fca8730cd9 100644
--- a/python/paddle/distributed/auto_parallel/dist_context.py
+++ b/python/paddle/distributed/auto_parallel/dist_context.py
@@ -21,8 +21,10 @@ from .dist_tensor import DistributedTensor
 from .dist_op import DistributedOperator
 from .process_mesh import ProcessMesh
+from .utils import _copy_dist_attr_to_cpp
 from .utils import is_loss_grad_op
 
+
 # There always exists a default context for user. And user can set it to another one.
 _g_default_distributed_context = None
 
@@ -76,6 +78,7 @@ def __init__(
         self._serial_optimizer = None
         self._serial_feed_vars = {}
         self._serial_fetch_vars = {}
+        self._lr_optimizer = None  # record the optimizer holding lr_scheduler
 
         # Data members related to the program
         self._dist_tensors_for_program = {}
@@ -392,7 +395,7 @@ def _restore(
         if dist:
             self._restore_dist_info(dist_mode)
 
-    def initialize(self, with_graph=True):
+    def initialize(self, with_graph=True, with_cpp=False):
         if not self._is_initialized:
             if not self._serial_main_program:
                 if self._original_serial_main_program:
@@ -425,6 +428,10 @@ def initialize(self, with_graph=True):
             self._ops_ids = list(self._dist_ops_for_program.keys())
             self._is_initialized = True
 
+            # TODO: This will be removed in the future
+            if with_cpp:
+                _copy_dist_attr_to_cpp(self)
+
             if with_graph:
                 set_flags({"FLAGS_convert_all_blocks": True})
                 self._serial_graph = framework.IrGraph(
@@ -597,7 +604,11 @@ def _init_dist_attr_for_program(self, no_default=False):
                     tensor
                 )
                 if default_dist_tensor and default_ctx is not self:
-                    self.add_dist_tensor_for_program(default_dist_tensor)
+                    dist_tensor = DistributedTensor(tensor)
+                    dist_tensor.dist_attr = copy.deepcopy(
+                        default_dist_tensor.dist_attr
+                    )
+                    self.add_dist_tensor_for_program(dist_tensor)
                 current_dist_tensor = self.get_dist_tensor_for_program(tensor)
                 if current_dist_tensor is None:
                     dist_tensor = DistributedTensor(tensor)
@@ -606,7 +617,9 @@ def _init_dist_attr_for_program(self, no_default=False):
                 # Copy the distributed operators in the default context
                 default_dist_op = default_ctx.get_dist_op_for_program(op)
                 if default_dist_op and default_ctx is not self:
-                    self.add_dist_op_for_program(default_dist_op)
+                    dist_op = DistributedOperator(op)
+                    dist_op.dist_attr = copy.deepcopy(default_dist_op.dist_attr)
+                    self.add_dist_op_for_program(dist_op)
                 current_dist_op = self.get_dist_op_for_program(op)
                 if current_dist_op is None:
                     dist_op = DistributedOperator(op)
diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py
index 73d1c1412d1e3..c22ad2b831d4e 100644
--- a/python/paddle/distributed/auto_parallel/utils.py
+++ b/python/paddle/distributed/auto_parallel/utils.py
@@ -1907,3 +1907,120 @@ def validate_opt(optimizer):
         optimizer._parameter_list = None
         optimizer._param_groups = None
     return optimizer
+
+
+def _copy_tensor_dist_attr_to_cpp(cpp_dist_attr, py_dist_attr):
+    py_process_mesh = py_dist_attr.process_mesh
+    if py_process_mesh is not None:
+        cpp_dist_attr.process_mesh = core.ProcessMesh(
+            py_process_mesh.shape,
+            py_process_mesh.process_ids,
+            ["d" + str(i) for i in range(len(py_process_mesh.shape))],
+        )
+    cpp_dist_attr.dims_mapping = py_dist_attr.dims_mapping
+    cpp_dist_attr.annotated = py_dist_attr._is_annotated
+
+
+def _copy_tensor_dist_attr_from_cpp(cpp_dist_attr, py_dist_attr):
+    from .process_mesh import ProcessMesh
+
+    cpp_process_mesh = cpp_dist_attr.process_mesh
+    if not cpp_process_mesh.empty():
+        py_dist_attr.process_mesh = ProcessMesh(
+            shape=cpp_process_mesh.shape,
+            process_ids=cpp_process_mesh.process_ids,
+        )
+    py_dist_attr.dims_mapping = cpp_dist_attr.dims_mapping
+    py_dist_attr._is_annotated = cpp_dist_attr.annotated
+
+
+def _copy_op_dist_attr_to_cpp(cpp_dist_attr, py_dist_attr):
+    py_process_mesh = py_dist_attr.process_mesh
+    if py_process_mesh is not None:
+        cpp_dist_attr.process_mesh = core.ProcessMesh(
+            py_process_mesh.shape,
+            py_process_mesh.process_ids,
+            ["d" + str(i) for i in range(len(py_process_mesh.shape))],
+        )
+    cpp_dist_attr.impl_type = py_dist_attr.impl_type
+    cpp_dist_attr.impl_idx = py_dist_attr.impl_idx
+    cpp_dist_attr.annotated = py_dist_attr._is_annotated
+    for name, py_tensor_dist_attr in py_dist_attr.inputs_dist_attrs.items():
+        cpp_tensor_dist_attr = cpp_dist_attr.get_input_dist_attr(name)
+        _copy_tensor_dist_attr_to_cpp(cpp_tensor_dist_attr, py_tensor_dist_attr)
+    for name, py_tensor_dist_attr in py_dist_attr.outputs_dist_attrs.items():
+        cpp_tensor_dist_attr = cpp_dist_attr.get_output_dist_attr(name)
+        _copy_tensor_dist_attr_to_cpp(cpp_tensor_dist_attr, py_tensor_dist_attr)
+
+
+def _copy_op_dist_attr_from_cpp(cpp_dist_attr, py_dist_attr):
+    from .process_mesh import ProcessMesh
+
+    cpp_process_mesh = cpp_dist_attr.process_mesh
+    if not cpp_process_mesh.empty():
+        py_dist_attr.process_mesh = ProcessMesh(
+            shape=cpp_process_mesh.shape,
+            process_ids=cpp_process_mesh.process_ids,
+        )
+    py_dist_attr.impl_type = cpp_dist_attr.impl_type
+    py_dist_attr.impl_idx = cpp_dist_attr.impl_idx
+    py_dist_attr._is_annotated = cpp_dist_attr.annotated
+    py_dist_attr.op_type = cpp_dist_attr.op.type()
+    for name, cpp_tensor_dist_attr in cpp_dist_attr.inputs_dist_attrs.items():
+        py_tensor_dist_attr = py_dist_attr.get_input_dist_attr(name)
+        _copy_tensor_dist_attr_from_cpp(
+            cpp_tensor_dist_attr, py_tensor_dist_attr
+        )
+    for name, cpp_tensor_dist_attr in cpp_dist_attr.outputs_dist_attrs.items():
+        py_tensor_dist_attr = py_dist_attr.get_output_dist_attr(name)
+        _copy_tensor_dist_attr_from_cpp(
+            cpp_tensor_dist_attr, py_tensor_dist_attr
+        )
+
+
+def _copy_dist_attr_to_cpp(dist_context):
+    for dist_tensor in dist_context._dist_tensors_for_program.values():
+        _copy_tensor_dist_attr_to_cpp(
+            dist_tensor.serial_tensor.dist_attr, dist_tensor.dist_attr
+        )
+
+    for dist_op in dist_context._dist_ops_for_program.values():
+        _copy_op_dist_attr_to_cpp(
+            dist_op.serial_op.dist_attr, dist_op.dist_attr
+        )
+
+
+def _copy_dist_attr_from_cpp(dist_context):
+    for dist_tensor in dist_context._dist_tensors_for_program.values():
+        _copy_tensor_dist_attr_from_cpp(
+            dist_tensor.serial_tensor.dist_attr, dist_tensor.dist_attr
+        )
+
+    for dist_op in dist_context._dist_ops_for_program.values():
+        _copy_op_dist_attr_from_cpp(
+            dist_op.serial_op.dist_attr, dist_op.dist_attr
+        )
+
+
+def _copy_dist_attr_to_cpp_for_graph(dist_context):
+    for node in dist_context.serial_ordered_nodes:
+        if node.is_var() and node.var() is not None:
+            py_dist_attr = dist_context.get_tensor_dist_attr_for_graph(node)
+            cpp_dist_attr = node.var().dist_attr
+            _copy_tensor_dist_attr_to_cpp(cpp_dist_attr, py_dist_attr)
+        if node.is_op() and node.op() is not None:
+            py_dist_attr = dist_context.get_op_dist_attr_for_graph(node)
+            cpp_dist_attr = node.op().dist_attr
+            _copy_op_dist_attr_to_cpp(cpp_dist_attr, py_dist_attr)
+
+
+def _copy_dist_attr_from_cpp_for_graph(dist_context):
+    for node in dist_context.serial_ordered_nodes:
+        if node.is_var() and node.var() is not None:
+            py_dist_attr = dist_context.get_tensor_dist_attr_for_graph(node)
+            cpp_dist_attr = node.var().dist_attr
+            _copy_tensor_dist_attr_from_cpp(cpp_dist_attr, py_dist_attr)
+        if node.is_op() and node.op() is not None:
+            py_dist_attr = dist_context.get_op_dist_attr_for_graph(node)
+            cpp_dist_attr = node.op().dist_attr
+            _copy_op_dist_attr_from_cpp(cpp_dist_attr, py_dist_attr)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 2cceeabfc4c23..37645ef4ab326 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -2612,7 +2612,7 @@ def attr_names(self):
         """Get the names of all attributes defined."""
         return self.desc.attr_names()
 
-    def _get_attr(self, name):
+    def attr(self, name):
         """
         Get the attribute by name.
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
index bd6ccfd3922c8..c538ae126f016 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
@@ -103,6 +103,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
   py_test_modules(test_cluster_v2 MODULES test_cluster_v2)
   py_test_modules(test_process_mesh_v2 MODULES test_process_mesh_v2)
   py_test_modules(test_dist_attr_v2 MODULES test_dist_attr_v2)
+  py_test_modules(test_serialization MODULES test_serialization)
   py_test_modules(test_lr_grad_clip MODULES test_lr_grad_clip)
   py_test_modules(test_dist_matmul MODULES test_dist_matmul)
   py_test_modules(test_process_mesh MODULES test_process_mesh)
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_attr_v2.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_attr_v2.py
index acff242fd8c34..2c32fcf2942b8 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_attr_v2.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_attr_v2.py
@@ -13,15 +13,188 @@
 # limitations under the License
 
 import unittest
+import copy
+
 import paddle
+import numpy as np
+import paddle.nn as nn
 import paddle.static as static
+import paddle.nn.functional as F
+
+from paddle.distributed import fleet
+from paddle.distributed.fleet import auto
+from paddle.distributed.auto_parallel.dist_context import (
+    DistributedContext,
+    set_default_distributed_context,
+)
+from paddle.distributed.auto_parallel.utils import (
+    _copy_dist_attr_to_cpp,
+    _copy_dist_attr_from_cpp,
+    _copy_dist_attr_to_cpp_for_graph,
+    _copy_dist_attr_from_cpp_for_graph,
+)
+
 from paddle.fluid.core import TensorDistAttr
 from paddle.fluid.core import OperatorDistAttr
-
 from paddle.distributed.auto_parallel.process_mesh_v2 import ProcessMesh
 
 paddle.enable_static()
 
+batch_size = 4
+epoch_num = 10
+hidden_size = 1024
+sequence_len = 512
+_g_process_mesh = auto.ProcessMesh(mesh=[[0, 1], [2, 3]], dim_names=['x', 'y'])
+
+
+class MLPLayer(nn.Layer):
+    def __init__(
+        self,
+        hidden_size=1024,
+        intermediate_size=4 * 1024,
+        dropout_ratio=0.1,
+        initializer_range=0.02,
+    ):
+        super(MLPLayer, self).__init__()
+        d_model = hidden_size
+        dim_feedforward = intermediate_size
+        param_initializer = nn.initializer.Normal(
+            mean=0.0, std=initializer_range
+        )
+
+        self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
+        self.linear0 = nn.Linear(
+            d_model,
+            dim_feedforward,
+            weight_attr=paddle.ParamAttr(initializer=param_initializer),
+            bias_attr=None,
+        )
+        self.linear1 = nn.Linear(
+            dim_feedforward,
+            d_model,
+            weight_attr=paddle.ParamAttr(initializer=param_initializer),
+            bias_attr=None,
+        )
+
+    def forward(self, input):
+        out = self.norm(input)
+        auto.shard_tensor(
+            self.linear0.weight,
+            process_mesh=_g_process_mesh[0],
+            shard_spec=[None, 'y'],
+        )
+        out = self.linear0(out)
+        out = F.gelu(out, approximate=True)
+        auto.shard_tensor(
+            self.linear1.weight,
+            process_mesh=_g_process_mesh[1],
+            shard_spec=['y', None],
+        )
+        out = self.linear1(out)
+
+        return out
+
+
+def get_random_inputs_and_labels(input_shape, label_shape):
+    input = np.random.random(size=input_shape).astype('float32')
+    label = np.random.random(size=label_shape).astype('float32')
+    return input, label
+
+
+def batch_generator_creator():
+    def __reader__():
+        for _ in range(batch_size):
+            batch_input, batch_label = get_random_inputs_and_labels(
+                [batch_size, sequence_len, hidden_size],
+                [batch_size, sequence_len, 1],
+            )
+            yield batch_input, batch_label
+
+    return __reader__
+
+
+def get_program():
+    dist_strategy = fleet.DistributedStrategy()
+    dist_strategy.semi_auto = True
+    # fleet.init(is_collective=True, strategy=dist_strategy)
+
+    train_program = static.Program()
+    start_program = static.Program()
+    with static.program_guard(train_program, start_program):
+
+        # input
+        input = static.data(
+            name="input",
+            shape=[batch_size, sequence_len, hidden_size],
+            dtype='float32',
+        )
+        label = static.data(
+            name="label", shape=[batch_size, sequence_len, 1], dtype='float32'
+        )
+        data_holder = [input, label]
+        # dataloader
+        dataloader = paddle.io.DataLoader.from_generator(
+            feed_list=data_holder, capacity=4 * batch_size, iterable=False
+        )
+        dataloader.set_batch_generator(
+            batch_generator_creator(), places=paddle.static.cuda_places()
+        )
+        # data dist_attr
+        auto.shard_tensor(
+            input, process_mesh=_g_process_mesh[0], shard_spec=['y', None, None]
+        )
+        auto.shard_tensor(
+            label, process_mesh=_g_process_mesh[0], shard_spec=['y', None, None]
+        )
+
+        mlp_start = MLPLayer(
+            hidden_size=hidden_size,
+            intermediate_size=4 * hidden_size,
+            dropout_ratio=0.1,
+            initializer_range=0.02,
+        )
+        pred = mlp_start(input)
+
+        mlp_mid = MLPLayer(
+            hidden_size=hidden_size,
+            intermediate_size=4 * hidden_size,
+            dropout_ratio=0.1,
+            initializer_range=0.02,
+        )
+        pred = mlp_mid(pred)
+
+        mlp_end = MLPLayer(
+            hidden_size=hidden_size,
+            intermediate_size=4 * hidden_size,
+            dropout_ratio=0.1,
+            initializer_range=0.02,
+        )
+        pred = mlp_end(pred)
+
+        error_cost = paddle.nn.functional.square_error_cost(pred, label)
+        loss = paddle.mean(error_cost)
+
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=0.00001,
+            beta1=0.9,
+            beta2=0.999,
+            epsilon=1e-08,
+            grad_clip=None,
+        )
+
+        feed_vars = {"inputs": [input], "labels": [label]}
+        fetch_vars = {"loss": [loss]}
+
+    return (
+        train_program,
+        start_program,
+        dataloader,
+        loss,
+        optimizer,
+        feed_vars,
+        fetch_vars,
+    )
+
 
 class TestDistAttr(unittest.TestCase):
     def test_tensor_dist_attr_ctor(self):
@@ -102,23 +275,25 @@ def test_operator_dist_attr_ctor(self):
         op_dist_attr.set_output_dist_attr(output.name, output_dist_attr)
         self.assertEqual(op_dist_attr.process_mesh, process_mesh)
         self.assertEqual(
-            op_dist_attr.input_dist_attr(input.name).process_mesh, process_mesh
+            op_dist_attr.get_input_dist_attr(input.name).process_mesh,
+            process_mesh,
         )
         self.assertEqual(
-            op_dist_attr.input_dist_attr(input1.name).process_mesh, process_mesh
+            op_dist_attr.get_input_dist_attr(input1.name).process_mesh,
+            process_mesh,
         )
         self.assertEqual(
-            op_dist_attr.output_dist_attr(output.name).process_mesh,
+            op_dist_attr.get_output_dist_attr(output.name).process_mesh,
             process_mesh,
         )
         self.assertEqual(
-            op_dist_attr.input_dist_attr(input.name).dims_mapping, [0, -1]
+            op_dist_attr.get_input_dist_attr(input.name).dims_mapping, [0, -1]
         )
         self.assertEqual(
-            op_dist_attr.input_dist_attr(input1.name).dims_mapping, [-1, 1]
+            op_dist_attr.get_input_dist_attr(input1.name).dims_mapping, [-1, 1]
         )
         self.assertEqual(
-            op_dist_attr.output_dist_attr(output.name).dims_mapping, [0, 1]
+            op_dist_attr.get_output_dist_attr(output.name).dims_mapping, [0, 1]
         )
         self.assertTrue(op_dist_attr.verify())
         self.assertTrue(str(op_dist_attr), str(op_dist_attr))
@@ -126,13 +301,13 @@ def test_operator_dist_attr_ctor(self):
         op_dist_attr = OperatorDistAttr(op.desc)
         op_dist_attr.process_mesh = process_mesh
         # Set the distributed attribute of input directly
-        input_dist_attr = op_dist_attr.input_dist_attr(input.name)
+        input_dist_attr = op_dist_attr.get_input_dist_attr(input.name)
         input_dist_attr.dims_mapping = [-1, 0]
         # Set the distributed attribute of input1 directly
-        input1_dist_attr = op_dist_attr.input_dist_attr(input1.name)
+        input1_dist_attr = op_dist_attr.get_input_dist_attr(input1.name)
         input1_dist_attr.dims_mapping = [0, -1]
         # Set the distributed attribute of output directly
-        output_dist_attr = op_dist_attr.output_dist_attr(output.name)
+        output_dist_attr = op_dist_attr.get_output_dist_attr(output.name)
         output_dist_attr.dims_mapping = [-1, -1]
         self.assertEqual(op_dist_attr.process_mesh, process_mesh)
         self.assertEqual(input_dist_attr.process_mesh, process_mesh)
@@ -171,22 +346,25 @@ def test_operator_dist_attr(self):
 
         self.assertEqual(op.desc.dist_attr.process_mesh, process_mesh)
         self.assertEqual(
-            op.dist_attr.input_dist_attr(input.name).process_mesh, process_mesh
+            op.dist_attr.get_input_dist_attr(input.name).process_mesh,
+            process_mesh,
         )
         self.assertEqual(
-            op.dist_attr.input_dist_attr(input1.name).process_mesh, process_mesh
+            op.dist_attr.get_input_dist_attr(input1.name).process_mesh,
+            process_mesh,
         )
         self.assertEqual(
-            op.dist_attr.input_dist_attr(input.name).dims_mapping, [0, -1]
+            op.dist_attr.get_input_dist_attr(input.name).dims_mapping, [0, -1]
        )
        self.assertEqual(
-            op.dist_attr.input_dist_attr(input.name).dims_mapping, [0, -1]
+            op.dist_attr.get_input_dist_attr(input.name).dims_mapping, [0, -1]
        )
        self.assertEqual(
-            op.desc.dist_attr.input_dist_attr(input1.name).dims_mapping, [-1, 1]
+            op.desc.dist_attr.get_input_dist_attr(input1.name).dims_mapping,
+            [-1, 1],
        )
        self.assertEqual(
-            op.dist_attr.output_dist_attr(output.name).dims_mapping, [0, 1]
+            op.dist_attr.get_output_dist_attr(output.name).dims_mapping, [0, 1]
        )
         self.assertTrue(op.desc.dist_attr.verify())
         self.assertTrue(str(op_dist_attr), str(op_dist_attr))
@@ -195,5 +373,80 @@ def test_operator_dist_attr(self):
         self.assertEqual(op.desc.dist_attr, OperatorDistAttr(op.desc))
 
 
+class TestDistAttrConversion(unittest.TestCase):
+    def test_dist_attr_conversion_for_program(self):
+        set_default_distributed_context(DistributedContext())
+        (
+            train_program,
+            start_program,
+            dataloader,
+            loss,
+            optimizer,
+            feed_vars,
+            fetch_vars,
+        ) = get_program()
+        dist_context = DistributedContext(
+            train_program, start_program, optimizer, loss, feed_vars, fetch_vars
+        )
+        dist_context.initialize()
+        original_dist_tensors = copy.deepcopy(
+            dist_context._dist_tensors_for_program
+        )
+        original_dist_ops = copy.deepcopy(dist_context._dist_ops_for_program)
+
+        _copy_dist_attr_to_cpp(dist_context)
+        _copy_dist_attr_from_cpp(dist_context)
+
+        for dist_tensor in dist_context._dist_tensors_for_program.values():
+            original_dist_tensor = original_dist_tensors[
+                dist_tensor.serial_tensor.desc.original_id()
+            ]
+            self.assertEqual(
+                dist_tensor.dist_attr, original_dist_tensor.dist_attr
+            )
+
+        for dist_op in dist_context._dist_ops_for_program.values():
+            original_dist_op = original_dist_ops[
+                dist_op.serial_op.desc.original_id()
+            ]
+            self.assertEqual(dist_op.dist_attr, original_dist_op.dist_attr)
+
+    def test_dist_attr_conversion_for_graph(self):
+        set_default_distributed_context(DistributedContext())
+        (
+            train_program,
+            start_program,
+            dataloader,
+            loss,
+            optimizer,
+            feed_vars,
+            fetch_vars,
+        ) = get_program()
+        dist_context = DistributedContext(
+            train_program, start_program, optimizer, loss, feed_vars, fetch_vars
+        )
+        dist_context.initialize()
+        original_dist_tensors = copy.deepcopy(
+            dist_context._dist_tensors_for_graph
+        )
+        original_dist_ops = copy.deepcopy(dist_context._dist_ops_for_graph)
+
+        _copy_dist_attr_to_cpp_for_graph(dist_context)
+        _copy_dist_attr_from_cpp_for_graph(dist_context)
+
+        for (
+            node_id,
+            dist_tensor,
+        ) in dist_context._dist_tensors_for_graph.items():
+            original_dist_tensor = original_dist_tensors[node_id]
+            self.assertEqual(
+                dist_tensor.dist_attr, original_dist_tensor.dist_attr
+            )
+
+        for node_id, dist_op in dist_context._dist_ops_for_graph.items():
+            original_dist_op = original_dist_ops[node_id]
+            self.assertEqual(dist_op.dist_attr, original_dist_op.dist_attr)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_serialization.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_serialization.py
new file mode 100644
index 0000000000000..343320b4b3e06
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_serialization.py
@@ -0,0 +1,287 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import unittest
+
+import paddle
+import numpy as np
+import paddle.nn as nn
+import paddle.static as static
+import paddle.nn.functional as F
+from paddle.fluid.framework import Program
+
+import paddle.distributed.fleet as fleet
+from paddle.distributed.fleet import auto
+from paddle.distributed.auto_parallel.dist_context import (
+    DistributedContext,
+    set_default_distributed_context,
+)
+
+from paddle.fluid.core import TensorDistAttr
+from paddle.distributed.auto_parallel.process_mesh_v2 import ProcessMesh
+
+paddle.enable_static()
+
+batch_size = 4
+epoch_num = 10
+hidden_size = 1024
+sequence_len = 512
+_g_process_mesh = auto.ProcessMesh(mesh=[[0, 1], [2, 3]], dim_names=['x', 'y'])
+
+
+class MLPLayer(nn.Layer):
+    def __init__(
+        self,
+        hidden_size=1024,
+        intermediate_size=4 * 1024,
+        dropout_ratio=0.1,
+        initializer_range=0.02,
+    ):
+        super(MLPLayer, self).__init__()
+        d_model = hidden_size
+        dim_feedforward = intermediate_size
+        param_initializer = nn.initializer.Normal(
+            mean=0.0, std=initializer_range
+        )
+
+        self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
+        self.linear0 = nn.Linear(
+            d_model,
+            dim_feedforward,
+            weight_attr=paddle.ParamAttr(initializer=param_initializer),
+            bias_attr=None,
+        )
+        self.linear1 = nn.Linear(
+            dim_feedforward,
+            d_model,
+            weight_attr=paddle.ParamAttr(initializer=param_initializer),
+            bias_attr=None,
+        )
+
+    def forward(self, input):
+        out = self.norm(input)
+        auto.shard_tensor(
+            self.linear0.weight,
+            process_mesh=_g_process_mesh[0],
+            shard_spec=[None, 'y'],
+        )
+        out = self.linear0(out)
+        out = F.gelu(out, approximate=True)
+        auto.shard_tensor(
+            self.linear1.weight,
+            process_mesh=_g_process_mesh[1],
+            shard_spec=['y', None],
+        )
+        out = auto.shard_op(self.linear1, process_mesh=_g_process_mesh)(out)
+
+        return out
+
+
+def get_random_inputs_and_labels(input_shape, label_shape):
+    input = np.random.random(size=input_shape).astype('float32')
+    label = np.random.random(size=label_shape).astype('float32')
+    return input, label
+
+
+def batch_generator_creator():
+    def __reader__():
+        for _ in range(batch_size):
+            batch_input, batch_label = get_random_inputs_and_labels(
+                [batch_size, sequence_len, hidden_size],
+                [batch_size, sequence_len, 1],
+            )
+            yield batch_input, batch_label
+
+    return __reader__
+
+
+def get_program():
+    dist_strategy = fleet.DistributedStrategy()
+    dist_strategy.semi_auto = True
+    # fleet.init(is_collective=True, strategy=dist_strategy)
+
+    train_program = static.Program()
+    start_program = static.Program()
+    with static.program_guard(train_program, start_program):
+
+        # input
+        input = static.data(
+            name="input",
+            shape=[batch_size, sequence_len, hidden_size],
+            dtype='float32',
+        )
+        label = static.data(
+            name="label", shape=[batch_size, sequence_len, 1], dtype='float32'
+        )
+        data_holder = [input, label]
+        # dataloader
+        dataloader = paddle.io.DataLoader.from_generator(
+            feed_list=data_holder, capacity=4 * batch_size, iterable=False
+        )
+        dataloader.set_batch_generator(
+            batch_generator_creator(), places=paddle.static.cuda_places()
+        )
+        # data dist_attr
+        auto.shard_tensor(
+            input, process_mesh=_g_process_mesh[0], shard_spec=['y', None, None]
+        )
+        auto.shard_tensor(
+            label, process_mesh=_g_process_mesh[0], shard_spec=['y', None, None]
+        )
+
+        mlp_start = MLPLayer(
+            hidden_size=hidden_size,
+            intermediate_size=4 * hidden_size,
+            dropout_ratio=0.1,
+            initializer_range=0.02,
+        )
+        pred = mlp_start(input)
+
+        mlp_mid = MLPLayer(
+            hidden_size=hidden_size,
+            intermediate_size=4 * hidden_size,
+            dropout_ratio=0.1,
+            initializer_range=0.02,
+        )
+        pred = mlp_mid(pred)
+
+        mlp_end = MLPLayer(
+            hidden_size=hidden_size,
+            intermediate_size=4 * hidden_size,
+            dropout_ratio=0.1,
+            initializer_range=0.02,
+        )
+        pred = mlp_end(pred)
+
+        error_cost = paddle.nn.functional.square_error_cost(pred, label)
+        loss = paddle.mean(error_cost)
+
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=0.00001,
+            beta1=0.9,
+            beta2=0.999,
+            epsilon=1e-08,
+            grad_clip=None,
+        )
+
+        feed_vars = {"inputs": [input], "labels": [label]}
+        fetch_vars = {"loss": [loss]}
+
+    return (
+        train_program,
+        start_program,
+        dataloader,
+        loss,
+        optimizer,
+        feed_vars,
+        fetch_vars,
+    )
+
+
+class TestDistAttrSerialization(unittest.TestCase):
+    def test_serialization_tensor(self):
+        train_program = static.Program()
+        start_program = static.Program()
+        with static.program_guard(train_program, start_program):
+            input = static.data(name="input", shape=[2, 3], dtype='float32')
+            dist_attr = input.dist_attr
+            dist_attr.process_mesh = ProcessMesh([[0, 1, 2], [3, 4, 5]])
+            dist_attr.dims_mapping = [0, -1]
+            dist_attr.batch_dim = 1
+            dist_attr.dynamic_dims = [1, 1]
+            dist_attr_data = dist_attr.serialize_to_string()
+
+    def test_serialization_operator(self):
+        train_program = static.Program()
+        start_program = static.Program()
+        with static.program_guard(train_program, start_program):
+            input = static.data(name="input", shape=[2, 3], dtype='float32')
+            input1 = static.data(name="input1", shape=[3, 4], dtype='float32')
+            output = paddle.matmul(input, input1)
+            op = train_program.current_block().ops[0]
+            process_mesh = ProcessMesh([[0, 1, 2], [3, 4, 5]])
+            op_dist_attr = op.dist_attr
+
+            op_dist_attr.process_mesh = process_mesh
+            # Set the distributed attribute of input
+            input_dist_attr = TensorDistAttr(input.desc)
+            input_dist_attr.dims_mapping = [0, -1]
+            op_dist_attr.set_input_dist_attr(input.name, input_dist_attr)
+            # Set the distributed attribute of input1
+            input1_dist_attr = TensorDistAttr(input1.desc)
+            input1_dist_attr.dims_mapping = [-1, 1]
+            op_dist_attr.set_input_dist_attr(input1.name, input1_dist_attr)
+            # Set the distributed attribute of output
+            output_dist_attr = TensorDistAttr(output.desc)
+            output_dist_attr.dims_mapping = [0, 1]
+            op_dist_attr.set_output_dist_attr(output.name, output_dist_attr)
+
+    def test_serialization_program(self):
+        set_default_distributed_context(DistributedContext())
+        (
+            train_program,
+            start_program,
+            dataloader,
+            loss,
+            optimizer,
+            feed_vars,
+            fetch_vars,
+        ) = get_program()
+        dist_context = DistributedContext(
+            train_program, start_program, optimizer, loss, feed_vars, fetch_vars
+        )
+        dist_context.initialize(with_cpp=True)
+
+        # Distribute context will clone the original train program to serial_main_program
+        original_program = dist_context.serial_main_program
+        for block in original_program.blocks:
+            for tensor in block.vars.values():
+                dist_attr_data = tensor.dist_attr.serialize_to_string()
+                tensor._set_attr("dist_attr", dist_attr_data)
+            for op in block.ops:
+                dist_attr_data = op.dist_attr.serialize_to_string()
+                op._set_attr("dist_attr", dist_attr_data)
+
+        program_data = original_program.desc.serialize_to_string()
+        program = Program.parse_from_string(program_data)
+
+        for block in program.blocks:
+            for tensor in block.vars.values():
+                dist_attr_data = tensor.attr("dist_attr")
+                tensor._remove_attr("dist_attr")
+                tensor.dist_attr.parse_from_string(dist_attr_data)
+            for op in block.ops:
+                dist_attr_data = op.attr("dist_attr")
+                op._remove_attr("dist_attr")
+                op.dist_attr.parse_from_string(dist_attr_data)
+
+        self.assertEqual(len(original_program.blocks), len(program.blocks))
+        for original_block, block in zip(
+            original_program.blocks, program.blocks
+        ):
+            self.assertEqual(
+                len(original_block.vars.values()), len(block.vars.values())
+            )
+            for original_tensor in original_block.vars.values():
+                self.assertEqual(
+                    original_tensor.dist_attr,
+                    block.vars[original_tensor.name].dist_attr,
+                )
+            self.assertEqual(len(original_block.ops), len(block.ops))
+            for original_op, op in zip(original_block.ops, block.ops):
+                self.assertEqual(original_op.dist_attr, op.dist_attr)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py
index a83574b10013f..834222be8cd1f 100644
--- a/python/paddle/fluid/tests/unittests/test_program.py
+++ b/python/paddle/fluid/tests/unittests/test_program.py
@@ -19,6 +19,8 @@ import paddle.fluid.layers as layers
 import paddle.fluid as fluid
 
+paddle.enable_static()
+
 main_program = default_main_program()
 
 
@@ -228,15 +230,13 @@ def test_update_var(self):
         b = program.desc.serialize_to_string()
         self.assertFalse(a == b)
 
-    # it seems the attrs of framework::VarDesc is not write to proto,
-    # except for persistable/need_check_feed/is_parameter/stop_gradient
     def test_update_var_attr(self):
         program = build_program()
         a = program.desc.serialize_to_string()
         program.current_block().var("x").desc._set_attr("a", 1)
-        self.assertFalse(program.desc.need_update())
+        self.assertTrue(program.desc.need_update())
         b = program.desc.serialize_to_string()
-        self.assertTrue(a == b)  # not affected
+        self.assertFalse(a == b)
 
 
 class TestProgramHash(unittest.TestCase):
fix:supoort rocm complie * refactor:name * fix:support rocm * fix:__HIP_NO_HALF_CONVERSIONS__ * optimize: delete scalar * fix:rocm can't support * fix:ernie error --- .../tensorrt/plugin/qkv_to_context_plugin.cu | 13 +- .../operators/fused/multihead_matmul_op.cu | 2 + .../operators/math/bert_encoder_functor.cu | 294 +++++++++++++++++- .../operators/math/bert_encoder_functor.h | 1 + 4 files changed, 299 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 336fcb5531799..27e40985d95f0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -63,7 +63,7 @@ template __global__ void reset_qk_bias(T *input, int real_seq_len, int seq_len) { if (threadIdx.x < seq_len) { int id = threadIdx.x + blockIdx.x * seq_len; - input[id] = threadIdx.x >= real_seq_len ? (T)-1e20f : (T)0.0f; + input[id] = threadIdx.x >= real_seq_len ? (T)0.0f : (T)1.0f; } } @@ -292,8 +292,9 @@ void QkvToContextPluginDynamic::configurePlugin( const phi::GPUContext &dev_ctx = *device_ctx; auto stream = dev_ctx.stream(); tensor_.Resize({batch, seq_len, seq_len, head_number_}); - int blocks = batch * head_number_ * seq_len; if (in[0].desc.type == nvinfer1::DataType::kHALF) { + tensor_.Resize({batch, seq_len, seq_len, 1}); + int blocks = batch * 1 * seq_len; mask_half_ = reinterpret_cast( tensor_.mutable_data(platform::CUDAPlace(device_id))); reset_qk_bias<<>>( @@ -462,6 +463,7 @@ int QkvToContextPluginDynamic::enqueue( head_size_, qkptr, input1_data, + false, tptr, scale_, static_cast(0.0)); @@ -510,10 +512,12 @@ int QkvToContextPluginDynamic::enqueue( head_number_); qk_bias = temp_qk_bias; } - // padding: mask_half_ = [0,0,...-1e20f,-1e20f] - // no_padding: mask_half_ = [0,.....0,.........,0] + // padding: mask_half_ = [1.0,....1.0...1.0....,0.0f] + // no_padding: mask_half_ = [1.0,....1.0,.........,1.0f] + bool bias_is_mask = false; if (ProductDim(input_desc[1].dims) == ProductDim(input_desc[0].dims)) { qk_bias = mask_half_; + bias_is_mask = true; } const half *input1_data = static_cast(qk_bias); // BxSx3xNxH => tptr: 3xBxNxSxH. 
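+  // Note: with this patch the half path encodes padding as a 0/1 mask (see
+  // reset_qk_bias above) instead of an additive 0/-1e20f bias; when
+  // bias_is_mask is set, softmax_kernel_with_mask rescales it as
+  // (1.0f - mask) * -10000.0f before the softmax, which masks the padded
+  // positions equivalently.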
@@ -552,6 +556,7 @@ int QkvToContextPluginDynamic::enqueue( head_size_, qkptr, input1_data, + bias_is_mask, tptr, half(1.), half(0.0)); diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index a258c0107859c..f1deedce5f133 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -365,6 +365,7 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { head_size, reinterpret_cast(qkptr), reinterpret_cast(bias_qk_d), + false, reinterpret_cast(tptr), __float2half(static_cast(scale)), __float2half(0.0)); @@ -377,6 +378,7 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { head_size, qkptr, bias_qk_d, + false, tptr, scale, T(0.0)); diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index d3fccf697ab3c..18e5ee845d26e 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -532,6 +532,257 @@ __global__ void SoftmaxKernelWithEltaddForLarge2(half2 *qk_buf_, #endif } +template +inline __device__ T ldg(const T *val) { + return __ldg(val); +} + +template +inline __device__ T hexp2(T a) { + return h2exp(a); +} + +template +inline __device__ T_OUT type2type2(T_IN a); + +template <> +inline __device__ half2 type2type2(half a) { + return __half2half2(a); +} + +template +inline __device__ T float2type2(float a); + +template <> +inline __device__ half2 float2type2(float a) { + return __float2half2_rn(a); +} + +template +inline __device__ T hmul2(T a, T b) { + return __hmul2(a, b); +} + +template +inline __device__ T hsub2(T a, T b) { + return __hsub2(a, b); +} + +template +inline __device__ T hadd2(T a, T b) { + return __hadd2(a, b); +} + +template +__inline__ __device__ T warpReduceSumV2(T *val) { +#pragma unroll + for (int i = 0; i < NUM; i++) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) + val[i] += __shfl_xor_sync(FINAL_MASK, val[i], mask, 32); + } + return (T)(0.0f); +} + +template +__inline__ __device__ T blockReduceSumV2(T *val) { + static __shared__ T shared[NUM][33]; + int lane = threadIdx.x & 0x1f; + int wid = threadIdx.x >> 5; + + warpReduceSumV2(val); + + if (lane == 0) { +#pragma unroll + for (int i = 0; i < NUM; i++) { + shared[i][wid] = val[i]; + } + } + + __syncthreads(); + + bool is_mask = threadIdx.x < (blockDim.x / 32.f); +#pragma unroll + for (int i = 0; i < NUM; i++) { + val[i] = is_mask ? shared[i][lane] : (T)(0.0f); + } + warpReduceSumV2(val); + return (T)0.0f; +} + +template +__inline__ __device__ T warpReduceMaxV2(T *val) { +#pragma unroll + for (int i = 0; i < NUM; i++) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) + val[i] = max(val[i], __shfl_xor_sync(FINAL_MASK, val[i], mask, 32)); + } + return (T)(0.0f); +} + +template +__inline__ __device__ T blockReduceMaxV2(T *val) { + static __shared__ T shared[32][NUM]; + int lane = threadIdx.x & 0x1f; // in-warp idx + int wid = threadIdx.x >> 5; // warp idx + + warpReduceMaxV2(val); // get maxx in each warp + + if (lane == 0) { +#pragma unroll + for (int i = 0; i < NUM; i++) { + shared[wid][i] = val[i]; + } + } + + __syncthreads(); + + // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent + // blockDim.x is not divided by 32 + bool is_mask = threadIdx.x < (blockDim.x / 32.f); +#pragma unroll + for (int i = 0; i < NUM; i++) { + val[i] = is_mask ? 
shared[lane][i] : (T)-1e20f; + } + warpReduceMaxV2(val); + + return (T)0.0f; +} + +template +__global__ void softmax_kernel_with_mask(T *qk_buf_, + const T *attr_mask, + const int batch_size, + const int head_num, + const int seq_len) { + using T2 = half2; + T2 *qk_buf_half2 = reinterpret_cast(qk_buf_); + const T2 *attr_mask_half2 = (const T2 *)attr_mask; + + for (int seq_id = blockIdx.x; seq_id < seq_len; seq_id += gridDim.x * NUM) { + T2 data[NUM][ITEMS_PER_THREAD]; + + int qk_offset[NUM]; + + __shared__ float s_sum[NUM], s_max[NUM]; + float local_max[NUM]; +#pragma unroll + for (int j = 0; j < NUM; j++) { + local_max[j] = -1e20f; + } + + for (int i = 0; + blockDim.x * i + threadIdx.x < (seq_len / 2) && i < ITEMS_PER_THREAD; + i++) { + int mask_offset[NUM]; +#pragma unroll + for (int j = 0; j < NUM; j++) { + qk_offset[j] = ((blockIdx.y * head_num + blockIdx.z) * seq_len + + seq_id + j * gridDim.x) * + (seq_len / 2) + + blockDim.x * i + threadIdx.x; + mask_offset[j] = + (blockIdx.y * seq_len + seq_id + j * gridDim.x) * (seq_len / 2) + + blockDim.x * i + threadIdx.x; + } + + T2 mask_val[NUM]; +#pragma unroll + for (int j = 0; j < NUM; j++) { + mask_val[j] = ldg(&attr_mask_half2[mask_offset[j]]); + } + + T2 qk[NUM]; +#pragma unroll + for (int j = 0; j < NUM; j++) { + qk[j] = qk_buf_half2[qk_offset[j]]; + } + +#pragma unroll + for (int j = 0; j < NUM; j++) { + mask_val[j] = hmul2(hsub2(float2type2(1.0f), mask_val[j]), + float2type2(-10000.0f)); + } + +#pragma unroll + for (int j = 0; j < NUM; j++) { + data[j][i] = hadd2(qk[j], mask_val[j]); + local_max[j] = fmax(local_max[j], + fmax(static_cast(data[j][i].x), + static_cast(data[j][i].y))); + } + } + + if (blockDim.x <= 32) { + warpReduceMaxV2(local_max); + } else { + blockReduceMaxV2(local_max); + } + + if (threadIdx.x == 0) { +#pragma unroll + for (int j = 0; j < NUM; j++) { + s_max[j] = local_max[j]; + } + } + __syncthreads(); + + float local_sum[NUM]; +#pragma unroll + for (int j = 0; j < NUM; j++) { + local_sum[j] = {0.f}; + } + + for (int i = 0; + blockDim.x * i + threadIdx.x < (seq_len / 2) && i < ITEMS_PER_THREAD; + i++) { +#pragma unroll + for (int j = 0; j < NUM; j++) { + data[j][i] = + hexp2(hsub2(data[j][i], float2type2(s_max[j]))); + } + +#pragma unroll + for (int j = 0; j < NUM; j++) { + local_sum[j] += static_cast(data[j][i].x + data[j][i].y); + } + } + + if (blockDim.x <= 32) { + warpReduceSumV2(local_sum); + } else { + blockReduceSumV2(local_sum); + } + + if (threadIdx.x == 0) { +#pragma unroll + for (int j = 0; j < NUM; j++) { + s_sum[j] = __fdividef(1.0f, local_sum[j] + 1e-6f); + } + } + __syncthreads(); + + for (int i = 0; + blockDim.x * i + threadIdx.x < (seq_len / 2) && i < ITEMS_PER_THREAD; + i++) { +#pragma unroll + for (int j = 0; j < NUM; j++) { + qk_offset[j] = ((blockIdx.y * head_num + blockIdx.z) * seq_len + + seq_id + j * gridDim.x) * + (seq_len / 2) + + blockDim.x * i + threadIdx.x; + } + +#pragma unroll + for (int j = 0; j < NUM; j++) { + qk_buf_half2[qk_offset[j]] = + hmul2(data[j][i], float2type2(s_sum[j])); + } + } + } +} + template inline void MatMulWithHeadQK(const phi::GPUContext &context, int head_num, @@ -544,6 +795,7 @@ inline void MatMulWithHeadQK(const phi::GPUContext &context, T *k_buf_, T *qk_buf_, const T *bias_qk, + bool bias_is_mask, T alpha, T beta) { CBLAS_TRANSPOSE transA = !q_trans ? 
CblasNoTrans : CblasTrans; @@ -583,13 +835,39 @@ inline void MatMulWithHeadQK(const phi::GPUContext &context, seq_len / 2, FINAL_MASK); } else { - SoftmaxKernelWithEltadd2<__half2><<>>( - reinterpret_cast<__half2 *>(qk_buf_), - reinterpret_cast(bias_qk), - batch_size, - head_num, - seq_len / 2, - FINAL_MASK); + if (bias_is_mask) { +#ifndef __HIPCC__ + constexpr int ITEMS_PER_THREAD = 1; + bool is_half2 = true; + + dim3 grid(seq_len, batch_size, head_num); + dim3 block((seq_len / 2 + 31) / 32 * 32); + block.x /= ITEMS_PER_THREAD; + assert(block.x <= 1024); + assert(grid.x % 4 == 0); + grid.x /= 4; + constexpr int NUM = 4; + softmax_kernel_with_mask + <<>>(reinterpret_cast(qk_buf_), + (const half *)bias_qk, + batch_size, + head_num, + seq_len); +#else + PADDLE_ENFORCE_EQ(bias_is_mask, + false, + platform::errors::InvalidArgument( + "rocm can't support that QK_bias is mask")); +#endif + } else { + SoftmaxKernelWithEltadd2<__half2><<>>( + reinterpret_cast<__half2 *>(qk_buf_), + reinterpret_cast(bias_qk), + batch_size, + head_num, + seq_len / 2, + FINAL_MASK); + } } } else { block = (seq_len <= 32) ? 32 : ((seq_len + 31) / 32) * 32; @@ -669,6 +947,7 @@ void MultiHeadGPUComputeFunctor::operator()(const phi::GPUContext &dev_ctx, int head_size, T *qkptr, const T *bias_qk_ptr, + bool bias_is_mask, T *tptr, T alpha, T beta) { @@ -690,6 +969,7 @@ void MultiHeadGPUComputeFunctor::operator()(const phi::GPUContext &dev_ctx, kptr, qkptr, bias_qk_ptr, + bias_is_mask, alpha, beta); // batch gemm stride, transpose. diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 6676da9bb8dd8..9a0b5a1ae3ab7 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -100,6 +100,7 @@ class MultiHeadGPUComputeFunctor { int head_size, T *qkptr, const T *bias_qk_ptr, + bool bias_is_mask, T *tptr, T alpha, T beta); From 34d13d6abb4ddf9ef407a61f3efc1894989d8bfa Mon Sep 17 00:00:00 2001 From: ronnywang Date: Mon, 31 Oct 2022 14:06:28 +0800 Subject: [PATCH 40/91] [CustomDevice] GetCCLComm add custom device support (#47168) * [CustomDevice] GetCCLComm add custom device support * update * update * update --- .../distributed/collective/CMakeLists.txt | 9 +-- .../collective/ProcessGroupCustom.cc | 18 +++-- .../collective/ProcessGroupCustom.h | 2 + paddle/phi/backends/CMakeLists.txt | 12 ++++ .../phi/backends/processgroup_comm_utils.cc | 65 +++++++++++++++++++ paddle/phi/kernels/CMakeLists.txt | 1 + .../phi/kernels/gpu/sync_batch_norm_kernel.cu | 20 ------ 7 files changed, 96 insertions(+), 31 deletions(-) create mode 100644 paddle/phi/backends/processgroup_comm_utils.cc diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 7f6a5e262b716..aa816f26f93f0 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -86,11 +86,6 @@ if(WITH_CUSTOM_DEVICE) cc_library( processgroup_custom SRCS ProcessGroupCustom.cc CustomCCLTools.cc Common.cc - DEPS phi_backends - place - enforce - collective_helper - device_context - phi_api - eager_api) + DEPS processgroup phi_backends place enforce collective_helper + device_context) endif() diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc index f18765a05f619..87bd474477eb9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc +++ 
b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc @@ -19,7 +19,6 @@ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/api/include/api.h" #include "paddle/phi/common/place.h" DECLARE_bool(xccl_blocking_wait); @@ -386,9 +385,10 @@ std::shared_ptr ProcessGroupCustom::Barrier( for (auto& place : places) { phi::DeviceGuard guard(place); - auto dt = full({1}, 0, phi::DataType::FLOAT32, place); - barrierTensors.push_back( - *std::dynamic_pointer_cast(dt.impl())); + phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim({1})); + auto allocator = std::unique_ptr( + new paddle::experimental::DefaultAllocator(place)); + barrierTensors.emplace_back(allocator.get(), meta); } auto task = ProcessGroupCustom::AllReduce(barrierTensors, barrierTensors); auto xccl_task = dynamic_cast(task.get()); @@ -396,5 +396,15 @@ std::shared_ptr ProcessGroupCustom::Barrier( return task; } +phi::ccl::CCLComm ProcessGroupCustom::CustomCCLComm(const Place& place) const { + std::vector places = {place}; + const auto& iter = places_to_customcomm_.find(GetKeyFromPlaces(places)); + PADDLE_ENFORCE_NE(iter, + places_to_customcomm_.end(), + platform::errors::InvalidArgument( + "Cannot find nccl comm in process group.")); + return iter->second[0]->GetCustomCCLComm(); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.h b/paddle/fluid/distributed/collective/ProcessGroupCustom.h index ce3532bbb6f0e..38a794a0e70cd 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.h +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.h @@ -96,6 +96,8 @@ class ProcessGroupCustom : public ProcessGroup { std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; + phi::ccl::CCLComm CustomCCLComm(const Place& place) const; + protected: virtual std::shared_ptr CreateTask( std::vector places, diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 9bc9573529241..b2095f7983f5a 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -58,3 +58,15 @@ if(WITH_CUSTOM_DEVICE) SRCS custom/capi_test.cc DEPS phi_capi) endif() + +set(COMM_UTILS_DEPS processgroup) +if(WITH_NCCL OR WITH_RCCL) + set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} processgroup_nccl) +endif() +if(WITH_CUSTOM_DEVICE) + set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} processgroup_custom) +endif() +cc_library( + processgroup_comm_utils + SRCS processgroup_comm_utils.cc + DEPS ${COMM_UTILS_DEPS}) diff --git a/paddle/phi/backends/processgroup_comm_utils.cc b/paddle/phi/backends/processgroup_comm_utils.cc new file mode 100644 index 0000000000000..580aebd17e6d5 --- /dev/null +++ b/paddle/phi/backends/processgroup_comm_utils.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
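+// This helper isolates ProcessGroup lookup in a single translation unit so
+// that PHI kernels can obtain a device communicator (NCCL/RCCL or a custom
+// device CCL) without depending on fluid collective symbols directly.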
+ +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/phi/backends/c_comm_lib.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#endif +#if defined(PADDLE_WITH_CUSTOM_DEVICE) +#include "paddle/fluid/distributed/collective/ProcessGroupCustom.h" +#endif + +namespace phi { +namespace detail { + +// FIXME(paddle-dev): Since the singleton of ProcessGroup in fluid is used in +// SyncBN, the fluid symbol will be dependent on external hardware access. +// Here, the part that depends on the fluid symbol is individually encapsulated +// as a temporary function to isolate external symbol dependencies. +// In the future, the dependence on the singleton in fluid in SyncBN needs +// to be removed. +// In principle, the PHI Kernel cannot use the global singleton internally, +// and the required members need to be passed in from the eucalyptus tree. +ccl::CCLComm GetCCLComm(const Place& place, int global_gid) { + paddle::distributed::ProcessGroup* pg = nullptr; + if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has( + global_gid)) { + pg = paddle::distributed::ProcessGroupMapFromGid::getInstance()->get( + global_gid); + } else { + return nullptr; + } + + if (paddle::platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + return static_cast(pg)->NCCLComm( + place); +#else + return nullptr; +#endif + } else if (paddle::platform::is_custom_place(place)) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + return static_cast(pg) + ->CustomCCLComm(place); +#else + return nullptr; +#endif + } else { + return nullptr; + } +} + +} // namespace detail +} // namespace phi diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 7cbd218543d65..8e45da27a806a 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -83,6 +83,7 @@ set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup) if(WITH_NCCL OR WITH_RCCL) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_nccl) endif() +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_comm_utils) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu index 106b3d66427a8..d41f50677fdf5 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu @@ -18,26 +18,6 @@ #include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h" namespace phi { -namespace detail { - -ccl::CCLComm GetCCLComm(const Place &place, int global_gid) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - ncclComm_t comm = nullptr; - - if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has( - global_gid)) { - auto *nccl_pg = static_cast( - paddle::distributed::ProcessGroupMapFromGid::getInstance()->get( - global_gid)); - comm = nccl_pg->NCCLComm(place); - } - return comm; -#else - return nullptr; -#endif -} - -} // namespace detail template void SyncBatchNormKernel(const Context &ctx, From 2953b708a03d023b6b6b1fecde7ac431f8f48a94 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Mon, 31 Oct 2022 14:23:40 +0800 Subject: [PATCH 41/91] feat: add int8 support for vit (#47330) * feat: add int8 support for vit * test:add test --- .../framework/ir/vit_attention_fuse_pass.cc | 26 +++++++++++++ .../tensorrt/convert/multihead_matmul_op.cc | 38 +++++++++++++++---- 
.../test_trt_convert_multihead_matmul.py | 12 +++++- 3 files changed, 68 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc b/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc index 4f61b0301081e..3ff91e0bcb76c 100644 --- a/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc +++ b/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc @@ -56,6 +56,22 @@ namespace paddle { namespace framework { namespace ir { +bool HasScale(OpDesc* const op_ptr, + std::string* name, + std::string regexp = "Input_scale_") { + name->clear(); + std::unordered_map attr_map = op_ptr->GetAttrMap(); + std::unordered_map::iterator iter; + int len = regexp.size(); + for (iter = attr_map.begin(); iter != attr_map.end(); iter++) { + if (regexp == iter->first.substr(0, len)) { + *name = iter->first; + return true; + } + } + return false; +} + void VitAttentionFusePass::ApplyImpl(ir::Graph* graph) const { GraphPatternDetector gpd; const std::string pattern_name = "vit_attention_fuse"; @@ -103,6 +119,16 @@ void VitAttentionFusePass::ApplyImpl(ir::Graph* graph) const { float alpha = PADDLE_GET_CONST(float, scale1_op->Op()->GetAttr("scale")); desc.SetAttr("alpha", alpha); + // int8 for fc + std::string scale_name; + if (HasScale(matmul0_op->Op(), &scale_name)) { + desc.SetAttr("Input_scale", matmul0_op->Op()->GetAttr(scale_name)); + } + if (HasScale(elementwise0_op->Op(), &scale_name, "Out")) { + desc.SetAttr("fc_out_threshold", + elementwise0_op->Op()->GetAttr(scale_name)); + } + // Create a new node for the fused op. auto vit_attention_node = graph->CreateOpNode(&desc); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index f997c8bd1f864..0515cb513d007 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -398,13 +398,37 @@ class MultiheadMatMulOpConverter : public OpConverter { // add fc layer nvinfer1::ILayer* fc_layer = nullptr; - fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, - FullyConnected, - *reshape_before_fc_layer->getOutput(0), - n, - weight, - bias); + if (op_desc.HasAttr("Input_scale")) { + engine_->SetTensorDynamicRange( + reshape_before_fc_layer->getOutput(0), in_scale); + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, + Convolution, + *reshape_before_fc_layer->getOutput(0), + n, + nv_ksize, + weight, + bias); + PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"), + true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers " + "in int8 mode")); + float out_scale = + PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + } else { + fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, + FullyConnected, + *reshape_before_fc_layer->getOutput(0), + n, + weight, + bias); + } + fc_layer->setName( + ("multihead_mamul_fc(Output: " + output_name + ")").c_str()); // add shuffle for CustomQKVToContextPluginDynamic layer auto* reshape_after_fc_layer = diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py index 6889f88fa4c95..fa1cb51e7f969 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py @@ 
-818,7 +818,11 @@ def generate_weight2(): "Y": ["matmul1_weight"], }, "op_outputs": {"Out": ["matmul1_output"]}, - "op_attrs": {"trans_x": False, "trans_y": False}, + "op_attrs": { + "trans_x": False, + "trans_y": False, + "Input_scale_layer": 1.0, + }, }, { "op_type": "elementwise_add", @@ -832,6 +836,7 @@ def generate_weight2(): "Scale_x": 1.0, "Scale_y": 1.0, "axis": 2, + "Out": 1.0, }, }, { @@ -1035,6 +1040,11 @@ def generate_trt_nodes_num(): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.workspace_size = 2013265920 + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num(), ( + 1e-3, + 1e-3, + ) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num(), ( 1e-3, From de4a79119fd5a063d6e1acccfa87f9db01462a30 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 31 Oct 2022 17:14:01 +0800 Subject: [PATCH 42/91] fix predictor memory write overflow (#47485) * fix predictor memory write overflow --- paddle/fluid/inference/api/analysis_predictor.cc | 5 +++++ paddle/fluid/inference/api/api_impl.cc | 6 ++++++ .../tests/api/analyzer_transformer_tester_helper.h | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 0190d9c291b8e..8663ec7d1f09b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -182,6 +182,11 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, pt.data.data(), paddle::platform::errors::InvalidArgument( "The data contained in the input PaddleTensor is illegal.")); + PADDLE_ENFORCE_EQ( + pt.data.length(), + t->numel() * paddle::experimental::SizeOf(t->dtype()), + paddle::platform::errors::InvalidArgument( + "The data contained in the input PaddleTensor had wrong length.")); } if (platform::is_cpu_place(place)) { diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 865c08fe2f64d..1b6779ac9fbc9 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -234,6 +234,12 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, inputs[i].data.data(), platform::errors::InvalidArgument( "The data of input tensor should not be null.")); + PADDLE_ENFORCE_EQ( + inputs[i].data.length(), + input.numel() * paddle::experimental::SizeOf(input.dtype()), + paddle::platform::errors::InvalidArgument( + "The data contained in the input PaddleTensor had wrong length.")); + if (platform::is_cpu_place(place_)) { // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. 
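+  // The PADDLE_ENFORCE_EQ above guarantees inputs[i].data.length() matches
+  // the tensor's byte size, so the memcpy below cannot write past the
+  // destination allocation.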
std::memcpy(static_cast(input_ptr), diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h b/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h index 68167c2a313a9..569d62f637ff1 100644 --- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h @@ -154,7 +154,7 @@ void PrepareInputs(std::vector *input_slots, init_idx.name = "init_idx"; init_idx.shape.assign({batch_size}); - init_idx.dtype = PaddleDType::INT32; + init_idx.dtype = PaddleDType::INT64; TensorAssignData(&init_idx, one_batch.init_idx); trg_src_attn_bias.name = "trg_src_attn_bias"; From 266283b21d76e1a5d4e36a44447befc103972ccb Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Mon, 31 Oct 2022 17:40:02 +0800 Subject: [PATCH 43/91] remove postprocess in dygraph ptq export (#47487) --- .../paddle/fluid/contrib/slim/quantization/imperative/ptq.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py index 67b7c1073e42c..febdacdf43eac 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py @@ -140,15 +140,11 @@ def save_quantized_model(self, model, path, input_spec=None, **config): assert isinstance( model, paddle.nn.Layer ), "The model must be the instance of paddle.nn.Layer." - is_postprocess = config.get('postprocess', True) - config.pop('postprocess', None) # Convert and save dygraph quantized model self._convert(model) paddle.jit.save(layer=model, path=path, input_spec=input_spec, **config) - if not is_postprocess: - return # Load inference program is_dynamic_mode = False From 6e1c14e357e4dc88a6c484cc79529aff3d8911c7 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Mon, 31 Oct 2022 17:45:56 +0800 Subject: [PATCH 44/91] [Einsum] Einsum support repeated labels. (#47290) * add unittest for einsum-v2-trace and diagonal * repeat labels. * einsum support repeated labels. * forward is ok for diagonal and undiagonalized. TODO: check backward is ok by our theorem. * backward is ok! * fix by PR suggestions. 
* fix ci error * fix ci error * fix ci warning --- paddle/phi/infermeta/unary.cc | 4 +- .../phi/kernels/cpu/diagonal_grad_kernel.cc | 5 +- paddle/phi/kernels/cpu/diagonal_kernel.cc | 2 + paddle/phi/kernels/diagonal_kernel.h | 14 + .../phi/kernels/fill_diagonal_tensor_kernel.h | 16 + .../phi/kernels/gpu/diagonal_grad_kernel.cu | 7 +- paddle/phi/kernels/gpu/diagonal_kernel.cu | 6 +- .../gpu/fill_diagonal_tensor_grad_kernel.cu | 1 + .../gpu/fill_diagonal_tensor_kernel.cu | 1 + paddle/phi/kernels/impl/einsum_grad_impl.h | 27 +- paddle/phi/kernels/impl/einsum_impl.h | 283 ++++++++++++------ .../fluid/tests/unittests/test_einsum_op.py | 49 +++ .../fluid/tests/unittests/test_einsum_v2.py | 128 ++++---- python/paddle/tensor/einsum.py | 8 - 14 files changed, 399 insertions(+), 152 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 150da6d59b9ff..1e4c226a9a976 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -835,6 +835,7 @@ void EinsumInferMeta(const std::vector& inputs, for (auto& i : inputs) { input_dims.push_back(i->dims()); } + std::vector input_strs; std::string right; ParseEinsumEquation(equation, input_dims, @@ -845,7 +846,8 @@ void EinsumInferMeta(const std::vector& inputs, &ellipsis_dims, &broadcast_dims, &output_dims, - &right); + &right, + &input_strs); VLOG(3) << "Einsum Infershape: input dims:" << paddle::string::join_strings(input_dims, "\n"); diff --git a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc index 5671e70c96e0a..f5d6ee2dce674 100644 --- a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc @@ -90,4 +90,7 @@ PD_REGISTER_KERNEL(diagonal_grad, float, double, int, - int64_t) {} + int64_t, + bool, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/diagonal_kernel.cc b/paddle/phi/kernels/cpu/diagonal_kernel.cc index 8ea5826ba25f7..f125802c19e24 100644 --- a/paddle/phi/kernels/cpu/diagonal_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_kernel.cc @@ -88,4 +88,6 @@ PD_REGISTER_KERNEL(diagonal, double, int, int64_t, + phi::dtype::complex, + phi::dtype::complex, bool) {} diff --git a/paddle/phi/kernels/diagonal_kernel.h b/paddle/phi/kernels/diagonal_kernel.h index 2d866d4e301b1..fc8844edc981a 100644 --- a/paddle/phi/kernels/diagonal_kernel.h +++ b/paddle/phi/kernels/diagonal_kernel.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" namespace phi { @@ -39,4 +40,17 @@ void DiagonalKernel(const Context& dev_ctx, int axis1, int axis2, DenseTensor* out); + +template +DenseTensor Diagonal(const Context& dev_ctx, + const DenseTensor& x, + int offset, + int axis1, + int axis2) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + DiagonalInferMeta(x, offset, axis1, axis2, &meta_out); + DiagonalKernel(dev_ctx, x, offset, axis1, axis2, &dense_out); + return dense_out; +} } // namespace phi diff --git a/paddle/phi/kernels/fill_diagonal_tensor_kernel.h b/paddle/phi/kernels/fill_diagonal_tensor_kernel.h index 9d6c8da93edb5..c3fe394a7f5fa 100644 --- a/paddle/phi/kernels/fill_diagonal_tensor_kernel.h +++ b/paddle/phi/kernels/fill_diagonal_tensor_kernel.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" namespace phi { @@ -27,6 +28,21 @@ void FillDiagonalTensorKernel(const Context& ctx, int dim2, DenseTensor* out); +template +DenseTensor 
FillDiagonalTensor(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int64_t offset, + int dim1, + int dim2) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + FillDiagonalTensorInferMeta(x, y, offset, dim1, dim2, &meta_out); + FillDiagonalTensorKernel( + ctx, x, y, offset, dim1, dim2, &dense_out); + return dense_out; +} + void CalMatDims(phi::DDim out_dims, int dim1, int dim2, diff --git a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu index a5c0e05959842..1fd1e446991fa 100644 --- a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu @@ -166,4 +166,9 @@ PD_REGISTER_KERNEL(diagonal_grad, float, double, int, - int64_t) {} + int64_t, + bool, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/diagonal_kernel.cu b/paddle/phi/kernels/gpu/diagonal_kernel.cu index 2e4ae5919971c..169cb3f2c78b9 100644 --- a/paddle/phi/kernels/gpu/diagonal_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_kernel.cu @@ -163,4 +163,8 @@ PD_REGISTER_KERNEL(diagonal, double, int, int64_t, - bool) {} + bool, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu index 0e302b23ee98c..04f03e3aae01a 100644 --- a/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu @@ -109,6 +109,7 @@ PD_REGISTER_KERNEL(fill_diagonal_tensor_grad, int8_t, uint8_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex, bool) {} diff --git a/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu index 739a8666e3143..33c06e339bfef 100644 --- a/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu +++ b/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu @@ -131,6 +131,7 @@ PD_REGISTER_KERNEL(fill_diagonal_tensor, int8_t, uint8_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex, bool) {} diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index 992b7572c1be5..bf27f3ef2b1be 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -20,15 +20,20 @@ #include "paddle/utils/string/string_helper.h" namespace phi { + template DenseTensor PerformTileAndReduction(const Context& dev_ctx, const LabelMap& label2type, const LabelMap& label2shape, const std::vector& broadcast_dims, const std::vector& ellipsis_dims, - std::string op_label, // value pass - DenseTensor& t) { // NOLINT - ReplaceEllipsis(op_label); + std::string equ, // value pass + DenseTensor& t) { // NOLINT + auto tmp_label = equ; + ReplaceEllipsis(tmp_label); + auto tmp_union = unique_labels(tmp_label); + auto op_label = std::string(tmp_union.begin(), tmp_union.end()); + VLOG(5) << "Start PerformTileAndReduction" << equ; DenseTensor ret; std::vector repeat_times; std::vector resize_dims; @@ -61,6 +66,8 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, })) { after_tile = t; } else { + VLOG(4) << "do TileKernel with repeat_times=" + << paddle::string::join_strings(repeat_times, ","); TileKernel(dev_ctx, t, repeat_times, &after_tile); } size_t n_ellipsis_idx = op_label.find(".", 0); @@ -92,7 +99,11 @@ DenseTensor 
PerformTileAndReduction(const Context& dev_ctx, VLOG(5) << "PermformTileAndReduction: recover shape: " << paddle::string::join_strings(recover_shape, ","); ret.Resize(make_ddim(recover_shape)); - return ret; + // undiagonalize by einsum equation. only contain undiagonal operations. + DenseTensor out; + VLOG(5) << "Undiagonal by einsum with args: " << op_label + "->" + equ; + EinsumKernel(dev_ctx, {&ret}, op_label + "->" + equ, &out); + return out; } template @@ -115,6 +126,7 @@ void EinsumGradKernel(const Context& dev_ctx, for (auto& i : x) { input_dims.push_back(i->dims()); } + std::vector input_strs; std::string right; ParseEinsumEquation(equation, input_dims, @@ -125,13 +137,15 @@ void EinsumGradKernel(const Context& dev_ctx, &ellipsis_dims, &broadcast_dims, &output_dims, - &right); + &right, + &input_strs); auto gather_labels_except_reduction = [&labeltype](std::string all) { std::string res(""); for (auto c : all) if (labeltype[static_cast(c)] != LabelType::Reduction) res += c; - return res; + auto tmp_unique = unique_labels(res); + return std::string(tmp_unique.begin(), tmp_unique.end()); }; if (x.size() == 1) { // Unary auto splits = paddle::string::split_string(equation, "->"); @@ -141,6 +155,7 @@ void EinsumGradKernel(const Context& dev_ctx, auto new_operands = std::vector(); new_operands.push_back(&out_grad); DenseTensor before_tile; + VLOG(5) << "new_equation is " << new_equation; EinsumKernel(dev_ctx, new_operands, new_equation, &before_tile); *(x_grad[0]) = PerformTileAndReduction(dev_ctx, labeltype, diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index dafb967ae8ed5..392949e065b31 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -18,6 +18,9 @@ #include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/diagonal_kernel.h" +#include "paddle/phi/kernels/fill_diagonal_tensor_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" @@ -89,6 +92,9 @@ class LabelMap { if (label == '.') i = N - 1; return map[i]; } + bool exist(char label) { return !is_default(label); } + + private: // non-exist is present by is_default bool is_default(char label) { return (*this)[static_cast(label)] == default_value; @@ -117,8 +123,9 @@ inline static void ReplaceEllipsis(std::string& s) { // NOLINT } } -inline std::vector union_labels(const std::vector& a, - const std::vector& b) { +template +inline std::vector union_labels(const CharIterable1& a, + const CharIterable2& b) { LabelMap counter(0); std::vector res; auto f = [&](char c) { @@ -132,6 +139,11 @@ inline std::vector union_labels(const std::vector& a, return res; } +template +inline std::vector unique_labels(const CharIterable& a) { + return union_labels(a, CharIterable()); +} + // Apply transforms to all_labels and get another all_labels inline std::vector TransformLabelsOrder( const std::vector& all_labels, @@ -160,9 +172,9 @@ inline static void GlobalInfo(const std::vector& op_labels, } for (auto& op : op_labels) { - for (auto& ch : op) { // char + for (auto& ch : unique_labels(op)) { // char int c = ch; - if (counter.is_default(c)) { + if (!counter.exist(c)) { all.push_back(ch); } counter[c] += 1; @@ -238,7 +250,7 @@ inline static void InferLabelShape(const std::vector& op_labels, v = op_dim[dim_ptr]; dim_ptr++; } - } else if (labelshape->is_default(c) || (*labelshape)[c] 
== -1) { + } else if (!labelshape->exist(c) || (*labelshape)[c] == -1) { (*labelshape)[c] = op_dim[dim_ptr]; dim_ptr++; } else if (op_dim[dim_ptr] != -1) { @@ -270,12 +282,15 @@ inline static void InferLabelShape(const std::vector& op_labels, << paddle::string::join_strings(*broadcast_dims, ","); } -inline static void InferLabelPerm(const std::string& op, +template +inline static void InferLabelPerm(const CharIterable& op, int n_broadcast, LabelMap* label2perm) { int cur = 0; for (int c : op) { - (*label2perm)[c] = cur; + if (!label2perm->exist( + c)) // can appear repeatly. we just record the first position. + (*label2perm)[c] = cur; if (c == '.') { cur += n_broadcast; } else { @@ -308,15 +323,21 @@ inline static void ParseEinsumEquation( std::vector>* ellipsis_dims, std::vector* broadcast_dims, std::vector* output_dims, - std::string* right) { + std::string* right, + std::vector* input_strs) { + VLOG(5) << "Start ParseEinsumEquation"; auto results = paddle::string::split_string(equation, "->"); auto left = results[0]; ReplaceEllipsis(left); *right = results[1].substr(1); ReplaceEllipsis(*right); auto op_labels = paddle::string::split_string(left, ","); - // split_string("i,") -> ["i"], we expect 2 op_labels. - if (left[left.size() - 1] == ',') op_labels.push_back(""); + // split_string("i,") -> ["i"], we push back a "". + // split_string("->") -> [], we push back a "". + if (op_labels.size() == 0) + op_labels.push_back(""); + else if (left[left.size() - 1] == ',') + op_labels.push_back(""); std::for_each(op_labels.begin(), op_labels.end(), ReplaceEllipsis); GlobalInfo(op_labels, *right, labeltype, all_labels); InferLabelShape(op_labels, inputs, labelshape, ellipsis_dims, broadcast_dims); @@ -327,8 +348,8 @@ inline static void ParseEinsumEquation( for (size_t i = 0; i < inputs.size(); ++i) { InferLabelPerm( op_labels[i], ellipsis_dims->at(i).size(), &((*label2perms)[i])); + (*input_strs).push_back(std::move(op_labels[i])); } - VLOG(5) << "Einsum Infershape: end"; } template @@ -371,20 +392,124 @@ std::vector GetShapeByType(const std::vector& all_labels, return res; } +inline static std::vector perm_moveto(int n, int from, int to) { + // a permution means moving `from` to `to`. + /* + f => t permtation + -------------------- + 0 1 2 3 4 5 + 5 => 2 : 0 2 5 2 3 4 + 2 => 5 : 0 1 3 4 5 2 + we can conclude the following rules. + */ + if (from < 0) from = n + from; + if (to < 0) to = n + to; + std::vector res(n); + for (int i = 0; i < n; ++i) { + res[i] = i; + } + res[to] = from; + auto offset = from > to ? -1 : 1; + auto start = from > to ? to + 1 : from; + auto end = from > to ? from : to - 1; + for (int i = start; i <= end; ++i) { + res[i] += offset; + } + return res; +} + template -DenseTensor PerformReduction(const Context& dev_ctx, - const DenseTensor& tensor, - const LabelMap& label2perm, - const std::vector& all_labels, - const std::vector& ellipsis, - const LabelMap& label2type) { +DenseTensor Undiagonal(const Context& dev_ctx, + const DenseTensor& tensor, + size_t insert_pos, + size_t axis) { + // tensor with shape (3, 4, 5, 2, 1), insert_pos = 5, axis = 2. 
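+  // i.e. append a new axis of length dims()[axis] at insert_pos; entries
+  // whose new index equals the axis index keep the original values, all
+  // other entries are zero.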
+ // output is (3, 4, 5, 2, 1, 5) + VLOG(5) << "Start undiagonal with args: insert_pos = " << insert_pos + << ", axis = " << axis; + std::vector shape(tensor.dims().size() + 1); + int point = 0; // point to the tensor.dims() + for (size_t i = 0; i < shape.size(); ++i) { + if (i == insert_pos) + shape[i] = tensor.dims()[axis]; + else + shape[i] = tensor.dims()[point++]; + } + auto zeros = Full(dev_ctx, shape, 0); + auto diags = Transpose( + dev_ctx, tensor, perm_moveto(tensor.dims().size(), axis, -1)); + return FillDiagonalTensor( + dev_ctx, zeros, diags, 0, insert_pos, axis + (insert_pos <= axis)); +} + +template +DenseTensor PerformUndiagonal(const Context& dev_ctx, + const DenseTensor& tensor, + int n_broadcast, + const std::string& equ) { + // if the equ is 'iijjkij', then the tensor must be 'ijk', so we have enough + // information to do un-diagonal with equ. + auto res = tensor; + LabelMap label2perm(-1); + InferLabelPerm(equ, n_broadcast, &label2perm); + // Un-Diagonal + int tot = + equ.size() + n_broadcast + (equ.find(".") != std::string::npos ? -1 : 0); + int cur = tot - 1; + for (auto it = equ.rbegin(); it != equ.rend(); ++it) { + char c = *it; + if (c == '.') { + cur -= n_broadcast; + } else { + if (cur != label2perm[c]) { + // do diagonal, followed by movedim(). + auto insert_pos = cur - tot + res.dims().size() + 1; + res = Undiagonal(dev_ctx, res, insert_pos, label2perm[c]); + } + --cur; + } + } + return res; +} + +template +DenseTensor PerformDiagonalAndReduction(const Context& dev_ctx, + const DenseTensor& tensor, + const std::string& equ, + const LabelMap& label2perm, + const std::vector& all_labels, + const std::vector& ellipsis, + const LabelMap& label2type) { + auto res = tensor; + // Diagonal + int tot = equ.size() + ellipsis.size() + + (equ.find(".") != std::string::npos ? -1 : 0); + int cur = tot - 1; + for (auto it = equ.rbegin(); it != equ.rend(); ++it) { + char c = *it; + if (c == '.') { + cur -= ellipsis.size(); + } else { + if (cur != label2perm[c]) { + // do diagonal, followed by movedim(). 
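+        // Diagonal() returns the extracted diagonal as the last axis, so the
+        // Transpose below moves it back to position label2perm[c], leaving
+        // the remaining labels where they were.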
+ VLOG(5) << "Do diagonal with shape=" + << paddle::string::join_strings(vectorize(res.dims()), ',') + << ", axis1=" << cur << ", axis2=" << label2perm[c]; + res = Diagonal(dev_ctx, res, 0, cur, label2perm[c]); + res = Transpose( + dev_ctx, res, perm_moveto(res.dims().size(), -1, label2perm[c])); + } + --cur; + } + } + // reduction auto indices = GetLabelIndexByType( all_labels, label2type, label2perm, ellipsis, LabelType::Reduction); - VLOG(5) << "call PerformReduction: with axis: " + VLOG(5) << "call PerformDiagonalAndReduction: with axis: " << paddle::string::join_strings(indices, ","); - if (indices.size() == 0) return tensor; + if (indices.size() == 0) return res; return Sum( - dev_ctx, tensor, phi::IntArray(indices), tensor.dtype(), true); + dev_ctx, res, phi::IntArray(indices), res.dtype(), true); } inline bool is_no_need_transpose(const std::vector& axis) { @@ -415,8 +540,8 @@ DenseTensor PerformTranspose(const Context& dev_ctx, template DenseTensor PerformContraction( const Context& dev_ctx, - const DenseTensor& A, - const DenseTensor& B, + const std::vector& operands, + const std::vector& input_strs, const std::vector& label2perm, const std::vector& all_labels, const LabelMap& label2type, @@ -467,8 +592,14 @@ DenseTensor PerformContraction( trans_t.ShareBufferWith(*(cache[operand_idx])); VLOG(5) << "Cache Used!"; } else { - auto reduct_t = PerformReduction( - dev_ctx, t, perm, all_labels, ellipsis, label2type); + auto reduct_t = + PerformDiagonalAndReduction(dev_ctx, + t, + input_strs[operand_idx], + perm, + all_labels, + ellipsis, + label2type); trans_t = PerformTranspose( dev_ctx, reduct_t, perm, reordered_all_labels, ellipsis, label2type); if (cache[operand_idx] != nullptr) @@ -499,10 +630,19 @@ DenseTensor PerformContraction( }; // Reduction, Reshape and Matmul - auto trans_a = preprocess(A, label2perm[0], ellipsis_dims[0], 0); - auto trans_b = preprocess(B, label2perm[1], ellipsis_dims[1], 1); - auto after_contraction = - Matmul(dev_ctx, trans_a, trans_b, false, false); + DenseTensor after_contraction; + if (operands.size() == 2) { + auto trans_a = + preprocess(*(operands[0]), label2perm[0], ellipsis_dims[0], 0); + auto trans_b = + preprocess(*(operands[1]), label2perm[1], ellipsis_dims[1], 1); + after_contraction = + Matmul(dev_ctx, trans_a, trans_b, false, false); + } else if (operands.size() == 1) { + after_contraction = + preprocess(*(operands[0]), label2perm[0], ellipsis_dims[0], 0); + } + if (recover_dim.size() == 0) recover_dim.push_back(1); VLOG(5) << "PerformContraction: recover_dim: " << paddle::string::join_strings(recover_dim, ","); after_contraction.Resize(make_ddim(recover_dim)); @@ -510,12 +650,11 @@ DenseTensor PerformContraction( } template -void TransposeToOutput(const Context& dev_ctx, - const DenseTensor& to_trans, - const std::string& right, - const std::vector& all_labels, - int n_broadcast_dims, - DenseTensor* output) { +DenseTensor TransposeToOutput(const Context& dev_ctx, + const DenseTensor& to_trans, + const std::vector& right, + const std::vector& all_labels, + int n_broadcast_dims) { std::vector axis; int offset = 0; if (std::find(all_labels.begin(), all_labels.end(), '.') != @@ -534,12 +673,11 @@ void TransposeToOutput(const Context& dev_ctx, } } if (is_no_need_transpose(axis)) { - output->ShareBufferWith(to_trans); - return; + return to_trans; } VLOG(5) << "call TransposeToOutput: with axis: " << paddle::string::join_strings(axis, ","); - TransposeKernel(dev_ctx, to_trans, axis, output); + return Transpose(dev_ctx, to_trans, axis); } template 
@@ -550,6 +688,7 @@ void EinsumKernelImpl(const Context& dev_ctx, DenseTensor* out, std::vector cache, bool is_forward = true) { + VLOG(5) << "Start EinsumKernelImpl"; ValidationCheck(equation); // collect the following informations to prepare einsum. LabelMap labelshape(0); @@ -564,6 +703,7 @@ void EinsumKernelImpl(const Context& dev_ctx, for (auto& i : inputs) { input_dims.push_back(i->dims()); } + std::vector input_strs; std::string right; if (!is_forward) { all_labels = forward_all_labels; @@ -577,57 +717,32 @@ void EinsumKernelImpl(const Context& dev_ctx, &ellipsis_dims, &broadcast_dims, &output_dims, - &right); - out->Resize(make_ddim(output_dims)); - if (inputs.size() == 2) { - auto& A = inputs[0]; - auto& B = inputs[1]; - // Reduction and Contract Procedure - auto after_contraction = PerformContraction(dev_ctx, - *A, - *B, - label2perms, - all_labels, - labeltype, - labelshape, - ellipsis_dims, - broadcast_dims, - cache, - !is_forward); - TransposeToOutput(dev_ctx, - after_contraction, - right, - all_labels, - broadcast_dims.size(), - out); - // Reshape Procedure - } else if (inputs.size() == 1) { - if (cache[0] != nullptr) { // For compatibility, may be cache is nullptr if - // loading the program from v2.3.0 - (*cache[0]) = *(inputs[0]); // ShareBuffer for backward, because backward - // we can only see cached tensor. - } - auto reduce_A = PerformReduction(dev_ctx, - *inputs[0], - label2perms[0], - all_labels, - ellipsis_dims[0], - labeltype); - std::vector right_labels; - for (auto c : right) right_labels.push_back(c); - right_labels = union_labels(right_labels, all_labels); - *out = PerformTranspose(dev_ctx, - reduce_A, - label2perms[0], - right_labels, - broadcast_dims, - labeltype); - out->Resize(make_ddim(output_dims)); - } else { + &right, + &input_strs); + if (inputs.size() > 2) { PADDLE_THROW(phi::errors::InvalidArgument( "EinsumOp kernel only support len(operands) between (0, 2]. Use " "opt_einsum first to convert multi-variable to binary-variable.")); } + auto after_contraction = PerformContraction(dev_ctx, + inputs, + input_strs, + label2perms, + all_labels, + labeltype, + labelshape, + ellipsis_dims, + broadcast_dims, + cache, + !is_forward); + *out = TransposeToOutput(dev_ctx, + after_contraction, + unique_labels(right), + all_labels, + broadcast_dims.size()); + *out = PerformUndiagonal( + dev_ctx, *out, broadcast_dims.size(), right); + out->Resize(make_ddim(output_dims)); } template diff --git a/python/paddle/fluid/tests/unittests/test_einsum_op.py b/python/paddle/fluid/tests/unittests/test_einsum_op.py index 9db367a23357f..bb48cd31dd481 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_op.py @@ -154,5 +154,54 @@ def set_mandatory(self): self.equation = "i,i->" +class TestEinsumWithDiagonal(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(10, 10)] + self.types = [np.float64] + self.equation = "ii->" + + +class TestEinsumWithDiagonal2(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(10, 3, 10)] + self.types = [np.float64] + self.equation = "iji->j" + + +class TestEinsumWithDiagonal3(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(5, 3, 2, 1, 4, 5)] + self.types = [np.float64] + self.equation = "a...a->..." + + +class TestEinsumWithDiagonal4(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(5, 3, 2, 1, 4, 5)] + self.types = [np.float64] + self.equation = "a...a->a..." 
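+
+# A minimal sketch (illustration only, not exercised by these tests) of what
+# the repeated-label equations here compute, assuming x = paddle.ones([2, 2]):
+#   paddle.einsum('ii->', x)    # trace(x) -> 2.0
+#   paddle.einsum('ii->i', x)   # diagonal(x) -> [1.0, 1.0]
+# 'a...a->...' generalizes the trace over the first/last axes for every
+# ellipsis index, while 'a...a->a...' keeps the diagonal instead of summing.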
+ + +class TestEinsumWithDiagonal5(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(8, 8, 8)] + self.types = [np.float64] + self.equation = "aaa->a" + + +class TestEinsumWithDiagonal6(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(3, 5, 7, 3), (5, 7, 5, 7)] + self.types = [np.float64, np.float64] + self.equation = "ijki,jkjk->ik" + + +class TestEinsumWithDiagonal8(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(3, 5, 7, 3), (5, 7, 5, 7)] + self.types = [np.float64, np.float64] + self.equation = "ijki,jkjk->" + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py index e7b041124c257..c7d2f9c76b250 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -41,22 +41,6 @@ class TestErrors(unittest.TestCase): def setUp(self): pass - def test_diagonalize_errors(self): - a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float') - a = paddle.to_tensor(a) - with self.assertRaisesRegex( - AssertionError, ('Duplicate labels are not supported.') - ): - paddle.einsum('...ii->...i', a) - with self.assertRaisesRegex( - AssertionError, ('Duplicate labels are not supported.') - ): - paddle.einsum('i...i', a) - with self.assertRaisesRegex( - AssertionError, ('Duplicate labels are not supported.') - ): - paddle.einsum('i...i->i...', a) - def test_param_errors(self): a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float') a = paddle.to_tensor(a) @@ -126,11 +110,6 @@ def test_param_errors(self): ("Invalid equation: missing ellipsis in output labels."), ): paddle.einsum('i...->i', a) - with self.assertRaisesRegex( - AssertionError, - ("Invalid equation: duplicate output labels are found."), - ): - paddle.einsum('i...->i...i', a) with self.assertRaisesRegex( AssertionError, ( @@ -162,6 +141,13 @@ def setUpClass(cls): "I": np.random.rand(2, 2), "J": np.random.rand(1, 3, 5), "K": np.random.rand(1, 2, 3, 4), + "X": np.random.rand(5, 5), + "L": np.random.rand(5, 10, 5), + "M": np.random.rand(5, 3, 2, 1, 4, 5), + "N": np.random.rand(5, 5, 5), + "O": np.random.rand(3, 5, 7, 3), + "P": np.random.rand(5, 7, 5, 7), + "S": np.random.rand(4, 3, 4, 4), } def _get_place(self, force_to_use_cpu=False): @@ -207,14 +193,54 @@ def test_forward(self): self.check_output_equal(result.numpy(), expected_result) -class TestEinsumVectorDot(TestEinsum): +class TestEinsumTraceDiag1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ii->", "data": ["X"]} + + +class TestEinsumTraceDiag2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "iji->j", "data": ["L"]} + + +class TestEinsumTraceDiag3(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "a...a->...", "data": ["M"]} + + +class TestEinsumTraceDiag4(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "a...a->a...", "data": ["M"]} + + +class TestEinsumTraceDiag5(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "aaa->a", "data": ["N"]} + + +# Numpy don't support i->ii, but paddle.einsum support. 
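+# (np.einsum raises for repeated output subscripts, so these two cases have
+# no NumPy reference; paddle implements i->iii by writing the vector onto
+# the diagonal of a zero-filled tensor, as TestSimpleUndiagonal below checks.)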
+# class TestEinsumTraceDiag6(TestEinsum):
+#     def setUp(self):
+#         self.sample = {"paradigm": "i->iii", "data": ["x"]}
+
+# class TestEinsumTraceDiag7(TestEinsum):
+#     def setUp(self):
+#         self.sample = {"paradigm": "i...->i...i", "data": ["S"]}
+
+
+class TestEinsumTraceDiag2Ops(TestEinsum):
+    def setUp(self):
+        self.sample = {"paradigm": "ijki,jkjk->ik", "data": ["O", "P"]}
+
+
+class TestEinsumIdentity(TestEinsum):
     def setUp(self):
-        self.sample = {"paradigm": "i,i->", "data": ["x", "x"]}
+        self.sample = {"paradigm": "...->...", "data": ["N"]}


-class TestEinsumVectorMul(TestEinsum):
+class TestEinsumElementwiseProduct(TestEinsum):
     def setUp(self):
-        self.sample = {"paradigm": "i,i->i", "data": ["x", "x"]}
+        self.sample = {"paradigm": "...,...->...", "data": ["N", "N"]}


 class TestEinsumVectorOuter(TestEinsum):
@@ -436,37 +462,12 @@ def test_sums(self):
         self.check_output("...,...", a, a)
         self.check_output("i,i", a, a)

-        # TODO(@xiongkun): explict broadcast in EinsumOp is not supported, it's not recommend to use einsum like this.
-        # p = np.ones((10, 2)).astype('float')
-        # q = np.ones((1, 2)).astype('float')
-        # self.check_output('ij,ij->j', p, q)
-
-        # TODO(@xiongkun): explict-label-broadcast in EinsumOp is not supported, it's not recommend to use einsum like this.
-        # x = np.array([2., 3.]).astype('float')
-        # y = np.array([4.]).astype('float')
-        # self.check_output("i, i", x, y)
-
-        # TODO(@xiongkun): explict-label-broadcast in EinsumOp is not supported, it's not recommend to use einsum like this.
-        # p = np.ones((1, 5)) / 2
-        # q = np.ones((5, 5)) / 2
-        # self.check_output("...ij,...jk->...ik", p, p)
-        # self.check_output("...ij,...jk->...ik", p, q)
-
         x = np.eye(2).astype('float')
         y = np.ones(2).astype('float')
         self.check_output("ji,i->", x, y)
         self.check_output("i,ij->", y, x)
         self.check_output("ij,i->", x, y)

-    def test_large_nops(self):
-        pass
-        # TODO(@xiongkun): explict broadcast in EinsumOp is not supported, it's not recommend to use einsum like this.
-        # a = np.arange(4 * 3 * 1 * 4).reshape(4, 3, 1, 4).astype('float')
-        # self.check_output('a...b,b...c,c...d', a, a, a)
-        # self.check_output('a...b,b...c,c...a', a, a, a)
-        # self.check_output('a...b,b...c,c...a', a, a, a)
-        # self.check_output('...ab,...ba,...ab,...ab', a, a, a, a)
-
     def test_static_graph(self):
         paddle.enable_static()
         fluid = paddle.fluid
@@ -569,5 +570,32 @@ def test_shape(self):
         c = paddle.einsum('xy,yz->xz', a, b)


+class TestSimpleUndiagonal(unittest.TestCase):
+    """
+    EinsumOp supports undiagonalization.
+    """
+
+    def test_shape(self):
+        paddle.disable_static()
+        A = paddle.to_tensor(np.array([1.0, 2.0]))
+        A_expect = paddle.to_tensor([[1.0, 0.0], [0.0, 2.0]])
+        A_actual = paddle.einsum('i->ii', A)
+        np.testing.assert_allclose(A_expect.numpy(), A_actual.numpy())
+
+
+class TestSimpleUndiagonal2(unittest.TestCase):
+    """
+    EinsumOp supports undiagonalization.
+    """
+
+    def test_shape(self):
+        paddle.disable_static()
+        A = paddle.to_tensor(np.array([1.0, 2.0]))
+        B = paddle.to_tensor(np.array([1.0, 1.0]))
+        A_expect = paddle.to_tensor([[2.0, 0.0], [0.0, 4.0]])
+        A_actual = paddle.einsum('i,j->ii', A, B)
+        np.testing.assert_allclose(A_expect.numpy(), A_actual.numpy())
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py
index 5c792f8fe0df8..19a63d515bea7 100644
--- a/python/paddle/tensor/einsum.py
+++ b/python/paddle/tensor/einsum.py
@@ -727,14 +727,6 @@ def preprocess(equation, *operands):
        '...' in lhs and '...'
not in rhs ), 'Invalid equation: missing ellipsis in output labels.' - assert not ( - len(list(filter(has_duplicated_labels, lhs.split(',')))) > 0 - ), 'Duplicate labels are not supported.' - - assert not has_duplicated_labels( - rhs - ), 'Invalid equation: duplicate output labels are found.' - return lhs, rhs, labels From 60e0c506a55820084481890d6266cca9e822af83 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 31 Oct 2022 18:37:56 +0800 Subject: [PATCH 45/91] [PHI]Standardise some C++ API (#47385) * standard api * fix ci bugs * fix ci bugs * fix ce bugs --- paddle/fluid/operators/activation_op.h | 6 +- paddle/fluid/operators/activation_op.kps | 10 +- paddle/fluid/operators/crop_tensor_op.cc | 4 +- paddle/fluid/operators/gaussian_random_op.cc | 2 +- paddle/fluid/operators/graph_send_recv_op.cc | 2 +- .../fluid/operators/graph_send_ue_recv_op.cc | 2 +- paddle/phi/api/yaml/backward.yaml | 22 +- paddle/phi/api/yaml/legacy_backward.yaml | 84 ++--- paddle/phi/api/yaml/legacy_ops.yaml | 82 ++--- paddle/phi/api/yaml/op_compat.yaml | 3 + paddle/phi/api/yaml/ops.yaml | 20 +- paddle/phi/api/yaml/sparse_backward.yaml | 2 +- paddle/phi/api/yaml/sparse_ops.yaml | 70 ++-- paddle/phi/core/compat/op_utils.h | 4 +- paddle/phi/infermeta/backward.cc | 8 +- paddle/phi/infermeta/backward.h | 8 +- paddle/phi/infermeta/multiary.cc | 316 +++++++++--------- paddle/phi/infermeta/multiary.h | 34 +- paddle/phi/infermeta/nullary.cc | 12 +- paddle/phi/infermeta/nullary.h | 12 +- paddle/phi/infermeta/ternary.cc | 16 +- paddle/phi/infermeta/ternary.h | 16 +- paddle/phi/infermeta/unary.cc | 10 +- paddle/phi/infermeta/unary.h | 10 +- paddle/phi/kernels/activation_grad_kernel.h | 2 +- paddle/phi/kernels/activation_kernel.h | 2 +- paddle/phi/kernels/addmm_kernel.h | 2 +- .../phi/kernels/cpu/activation_grad_kernel.cc | 6 +- paddle/phi/kernels/cpu/activation_kernel.cc | 4 +- ...p_tensor_kernel.cc => crop_grad_kernel.cc} | 8 +- ...p_tensor_grad_kernel.cc => crop_kernel.cc} | 14 +- ...an_random_kernel.cc => gaussian_kernel.cc} | 24 +- ...d_kernel.cc => send_u_recv_grad_kernel.cc} | 24 +- ...d_recv_kernel.cc => send_u_recv_kernel.cc} | 22 +- ..._kernel.cc => send_ue_recv_grad_kernel.cc} | 30 +- ..._recv_kernel.cc => send_ue_recv_kernel.cc} | 26 +- ..._grad_kernel.cc => send_uv_grad_kernel.cc} | 24 +- ...ph_send_uv_kernel.cc => send_uv_kernel.cc} | 26 +- ...rop_tensor_kernel.h => crop_grad_kernel.h} | 10 +- ...rop_tensor_grad_kernel.h => crop_kernel.h} | 10 +- paddle/phi/kernels/funcs/activation_functor.h | 8 +- ...sian_random_kernel.h => gaussian_kernel.h} | 14 +- .../phi/kernels/gpu/activation_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/activation_kernel.cu | 7 +- ...p_tensor_kernel.cu => crop_grad_kernel.cu} | 8 +- ...p_tensor_grad_kernel.cu => crop_kernel.cu} | 14 +- ...an_random_kernel.cu => gaussian_kernel.cu} | 20 +- .../phi/kernels/gpu/graph_send_recv_funcs.h | 2 +- ...d_kernel.cu => send_u_recv_grad_kernel.cu} | 24 +- ...d_recv_kernel.cu => send_u_recv_kernel.cu} | 22 +- ..._kernel.cu => send_ue_recv_grad_kernel.cu} | 30 +- ..._recv_kernel.cu => send_ue_recv_kernel.cu} | 32 +- ..._grad_kernel.cu => send_uv_grad_kernel.cu} | 24 +- ...ph_send_uv_kernel.cu => send_uv_kernel.cu} | 24 +- .../kernels/graph_send_ue_recv_grad_kernel.h | 36 -- .../phi/kernels/graph_send_ue_recv_kernel.h | 35 -- .../phi/kernels/graph_send_uv_grad_kernel.h | 33 -- paddle/phi/kernels/impl/addmm_kernel_impl.h | 2 +- ..._kernel_impl.h => crop_grad_kernel_impl.h} | 12 +- ...ensor_kernel_impl.h => crop_kernel_impl.h} | 12 +- ...an_random_kernel.cc 
=> gaussian_kernel.cc} | 19 +- ...ecv_kernel.h => send_u_recv_grad_kernel.h} | 12 +- paddle/phi/kernels/send_u_recv_kernel.h | 34 ++ ...ad_kernel.h => send_ue_recv_grad_kernel.h} | 22 +- paddle/phi/kernels/send_ue_recv_kernel.h | 35 ++ paddle/phi/kernels/send_uv_grad_kernel.h | 33 ++ ...raph_send_uv_kernel.h => send_uv_kernel.h} | 14 +- paddle/phi/kernels/sparse/addmm_kernel.h | 8 +- paddle/phi/kernels/sparse/cpu/addmm_kernel.cc | 4 +- paddle/phi/kernels/sparse/gpu/addmm_kernel.cu | 10 +- ...an_random_kernel.cc => gaussian_kernel.cc} | 19 +- paddle/phi/ops/compat/activation_sig.cc | 6 +- paddle/phi/ops/compat/addmm_sig.cc | 6 + paddle/phi/ops/compat/crop_tensor_sig.cc | 31 +- paddle/phi/ops/compat/gaussian_random_sig.cc | 18 +- paddle/phi/ops/compat/graph_send_recv_sig.cc | 9 +- .../phi/ops/compat/graph_send_ue_recv_sig.cc | 9 +- python/paddle/fluid/initializer.py | 6 +- python/paddle/fluid/layers/nn.py | 2 +- .../geometric/message_passing/send_recv.py | 10 +- .../incubate/operators/graph_send_recv.py | 2 +- python/paddle/nn/functional/activation.py | 2 +- python/paddle/nn/initializer/orthogonal.py | 2 +- python/paddle/sparse/multiary.py | 2 +- python/paddle/tensor/manipulation.py | 2 +- python/paddle/tensor/math.py | 2 +- python/paddle/tensor/random.py | 2 +- 87 files changed, 838 insertions(+), 837 deletions(-) rename paddle/phi/kernels/cpu/{crop_tensor_kernel.cc => crop_grad_kernel.cc} (82%) rename paddle/phi/kernels/cpu/{crop_tensor_grad_kernel.cc => crop_kernel.cc} (66%) rename paddle/phi/kernels/cpu/{gaussian_random_kernel.cc => gaussian_kernel.cc} (68%) rename paddle/phi/kernels/cpu/{graph_send_recv_grad_kernel.cc => send_u_recv_grad_kernel.cc} (89%) rename paddle/phi/kernels/cpu/{graph_send_recv_kernel.cc => send_u_recv_kernel.cc} (93%) rename paddle/phi/kernels/cpu/{graph_send_ue_recv_grad_kernel.cc => send_ue_recv_grad_kernel.cc} (95%) rename paddle/phi/kernels/cpu/{graph_send_ue_recv_kernel.cc => send_ue_recv_kernel.cc} (94%) rename paddle/phi/kernels/cpu/{graph_send_uv_grad_kernel.cc => send_uv_grad_kernel.cc} (94%) rename paddle/phi/kernels/cpu/{graph_send_uv_kernel.cc => send_uv_kernel.cc} (88%) rename paddle/phi/kernels/{crop_tensor_kernel.h => crop_grad_kernel.h} (78%) rename paddle/phi/kernels/{crop_tensor_grad_kernel.h => crop_kernel.h} (76%) rename paddle/phi/kernels/{gaussian_random_kernel.h => gaussian_kernel.h} (74%) rename paddle/phi/kernels/gpu/{crop_tensor_kernel.cu => crop_grad_kernel.cu} (82%) rename paddle/phi/kernels/gpu/{crop_tensor_grad_kernel.cu => crop_kernel.cu} (66%) rename paddle/phi/kernels/gpu/{gaussian_random_kernel.cu => gaussian_kernel.cu} (86%) rename paddle/phi/kernels/gpu/{graph_send_recv_grad_kernel.cu => send_u_recv_grad_kernel.cu} (86%) rename paddle/phi/kernels/gpu/{graph_send_recv_kernel.cu => send_u_recv_kernel.cu} (93%) rename paddle/phi/kernels/gpu/{graph_send_ue_recv_grad_kernel.cu => send_ue_recv_grad_kernel.cu} (96%) rename paddle/phi/kernels/gpu/{graph_send_ue_recv_kernel.cu => send_ue_recv_kernel.cu} (94%) rename paddle/phi/kernels/gpu/{graph_send_uv_grad_kernel.cu => send_uv_grad_kernel.cu} (95%) rename paddle/phi/kernels/gpu/{graph_send_uv_kernel.cu => send_uv_kernel.cu} (92%) delete mode 100644 paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h delete mode 100644 paddle/phi/kernels/graph_send_ue_recv_kernel.h delete mode 100644 paddle/phi/kernels/graph_send_uv_grad_kernel.h rename paddle/phi/kernels/impl/{crop_tensor_grad_kernel_impl.h => crop_grad_kernel_impl.h} (91%) rename paddle/phi/kernels/impl/{crop_tensor_kernel_impl.h => 
crop_kernel_impl.h} (95%) rename paddle/phi/kernels/onednn/{gaussian_random_kernel.cc => gaussian_kernel.cc} (76%) rename paddle/phi/kernels/{graph_send_recv_kernel.h => send_u_recv_grad_kernel.h} (75%) create mode 100644 paddle/phi/kernels/send_u_recv_kernel.h rename paddle/phi/kernels/{graph_send_recv_grad_kernel.h => send_ue_recv_grad_kernel.h} (55%) create mode 100644 paddle/phi/kernels/send_ue_recv_kernel.h create mode 100644 paddle/phi/kernels/send_uv_grad_kernel.h rename paddle/phi/kernels/{graph_send_uv_kernel.h => send_uv_kernel.h} (69%) rename paddle/phi/kernels/xpu/{gaussian_random_kernel.cc => gaussian_kernel.cc} (78%) diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 833015b803dd1..94d5fcca508f9 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -261,6 +261,11 @@ struct BaseActivationFunctor { template \ using name##TripleGradFunctor = phi::funcs::name##TripleGradFunctor; +template +using BReluFunctor = phi::funcs::HardTanhFunctor; +template +using BReluGradFunctor = phi::funcs::HardTanhGradFunctor; + USE_PHI_FUNCTOR(Cos) USE_PHI_FUNCTOR(Tan) USE_PHI_FUNCTOR(Acos) @@ -275,7 +280,6 @@ USE_PHI_FUNCTOR(Atanh) USE_PHI_FUNCTOR(Tanh) USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh) USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh) -USE_PHI_FUNCTOR(BRelu) USE_PHI_FUNCTOR(ThresholdedRelu) USE_PHI_FUNCTOR(Relu6) USE_PHI_FUNCTOR(LeakyRelu) diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 8cc5e925f7490..0ce55b7cf7331 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -130,6 +130,11 @@ class ActivationGradCudaKernel } }; +template +using CudaBReluFunctor = phi::funcs::CudaHardTanhFunctor; +template +using CudaBReluGradFunctor = phi::funcs::CudaHardTanhGradFunctor; + USE_PHI_FUNCTOR(CudaCos) USE_PHI_FUNCTOR(CudaTan) USE_PHI_FUNCTOR(CudaAcos) @@ -142,7 +147,6 @@ USE_PHI_FUNCTOR(CudaAsinh) USE_PHI_FUNCTOR(CudaAcosh) USE_PHI_FUNCTOR(CudaAtanh) USE_PHI_FUNCTOR(CudaTanh) -USE_PHI_FUNCTOR(CudaBRelu) USE_PHI_FUNCTOR(CudaLeakyRelu) USE_PHI_FUNCTOR(CudaThresholdedRelu) USE_PHI_FUNCTOR(CudaRelu6) @@ -276,13 +280,13 @@ REGISTER_OP_KERNEL( KP, plat::XPUPlace, ops::ActivationCudaKernel>); + phi::funcs::CudaHardTanhFunctor>); REGISTER_OP_KERNEL( brelu_grad, KP, plat::XPUPlace, ops::ActivationGradCudaKernel>); + phi::funcs::CudaHardTanhGradFunctor>); REGISTER_OP_KERNEL(ceil, KP, diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc index 44986baef8120..b74aaf8cb22a2 100644 --- a/paddle/fluid/operators/crop_tensor_op.cc +++ b/paddle/fluid/operators/crop_tensor_op.cc @@ -75,8 +75,8 @@ class CropTensorOp : public framework::OperatorWithKernel { x_dim.size())); if (ctx->IsRuntime()) { // If true, set the shape of Output(Out) according to Input(Shape) in - // CropTensorKernel with ExecutionContext. Also check LoD in - // CropTensorKernel. + // CropKernel with ExecutionContext. Also check LoD in + // CropKernel. 
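The crop_tensor to crop rename in this operator is purely mechanical; the kernel still computes a plain offset slice. A minimal sketch of the semantics through the Python binding (assuming the public paddle.crop API, which this patch keeps in place):

    import numpy as np
    import paddle

    x = paddle.to_tensor(np.arange(12.0).reshape(3, 4))
    # crop(x, shape, offsets): out = x[o0:o0 + s0, o1:o1 + s1, ...]
    out = paddle.crop(x, shape=[2, 2], offsets=[1, 1])
    np.testing.assert_array_equal(out.numpy(), x.numpy()[1:3, 1:3])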
ctx->ShareLoD("X", /*->*/ "Out"); } else { auto out_dims = std::vector(shape_dim[0], -1); diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index f418e48f7d9c8..b298ce5635e85 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -132,7 +132,7 @@ namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(gaussian_random, GaussianRandomInferShapeFunctor, - PD_INFER_META(phi::GaussianRandomInferMeta)); + PD_INFER_META(phi::GaussianInferMeta)); REGISTER_OPERATOR( gaussian_random, diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index c907ae2b704b8..9e57884c1412c 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -127,7 +127,7 @@ namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv, GraphSendRecvInferShapeFunctor, - PD_INFER_META(phi::GraphSendRecvInferMeta)); + PD_INFER_META(phi::SendURecvInferMeta)); REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP, ops::GraphSendRecvOpMaker, diff --git a/paddle/fluid/operators/graph_send_ue_recv_op.cc b/paddle/fluid/operators/graph_send_ue_recv_op.cc index 6c38ee65e8758..561c7e06f0b37 100644 --- a/paddle/fluid/operators/graph_send_ue_recv_op.cc +++ b/paddle/fluid/operators/graph_send_ue_recv_op.cc @@ -140,7 +140,7 @@ namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(graph_send_ue_recv, GraphSendUERecvInferShapeFunctor, - PD_INFER_META(phi::GraphSendUERecvInferMeta)); + PD_INFER_META(phi::SendUERecvInferMeta)); REGISTER_OPERATOR(graph_send_ue_recv, ops::GraphSendUERecvOP, ops::GraphSendUERecvOpMaker, diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index cb51e8fa13f4f..b13bd97a5a6a5 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -295,17 +295,6 @@ output : Tensor(x_grad) invoke : flip(out_grad, axis) -- backward_op : graph_send_uv_grad - forward : graph_send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") -> Tensor(out) - args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str message_op = "ADD") - output : Tensor(x_grad), Tensor(y_grad) - infer_meta : - func : GeneralBinaryGradInferMeta - param : [x, y] - kernel : - func : graph_send_uv_grad - data_type : x - - backward_op : lgamma_grad forward : lgamma(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -336,6 +325,17 @@ kernel : func : poisson_grad +- backward_op : send_uv_grad + forward : send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") -> Tensor(out) + args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str message_op = "ADD") + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : send_uv_grad + data_type : x + - backward_op : sin_grad forward : sin (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 916f5c405d7a7..0eb91e9b51d43 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -59,7 +59,7 @@ inplace : (grad_grad_out_grad -> grad_grad_x_grad) - backward_op : addmm_grad - forward : addmm (Tensor input, Tensor x, Tensor y, float alpha, float beta) -> Tensor(out) + forward : addmm 
(Tensor input, Tensor x, Tensor y, float beta, float alpha) -> Tensor(out) args : (Tensor input, Tensor x, Tensor y, Tensor out_grad, float alpha, float beta) output : Tensor(input_grad), Tensor(x_grad), Tensor(y_grad) infer_meta : @@ -198,17 +198,6 @@ kernel : func : bilinear_tensor_product_grad -- backward_op : brelu_grad - forward : brelu (Tensor x, float t_min, float t_max) -> Tensor(out) - args : (Tensor x, Tensor out_grad, float t_min, float t_max) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : brelu_grad - inplace : (out_grad -> x_grad) - - backward_op : broadcast_tensors_grad forward : broadcast_tensors (Tensor[] input) -> Tensor[](out) args : (Tensor[] input, Tensor[] out_grad) @@ -401,14 +390,14 @@ func : conv3d_transpose_grad use_gpudnn : true -- backward_op : crop_tensor_grad +- backward_op : crop_grad forward : crop_tensor (Tensor x, IntArray shape, IntArray offsets) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray offsets) output : Tensor(x_grad) infer_meta : - func : CropTensorGradInferMeta + func : CropGradInferMeta kernel : - func : crop_tensor_grad + func : crop_grad data_type : x - backward_op : cross_entropy_with_softmax_grad @@ -779,30 +768,6 @@ kernel : func : gelu_grad -- backward_op : graph_send_recv_grad - forward : graph_send_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count) - args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM") - output : Tensor(x_grad) - infer_meta : - func : GeneralUnaryGradInferMeta - param : [x] - kernel : - func : graph_send_recv_grad - data_type : out_grad - optional: out, dst_count - -- backward_op : graph_send_ue_recv_grad - forward : graph_send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) -> Tensor(out), Tensor(dst_count) - args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op) - output : Tensor(x_grad), Tensor(y_grad) - infer_meta : - func : GeneralBinaryGradInferMeta - param : [x, y] - kernel : - func : graph_send_ue_recv_grad - data_type : out_grad - optional: out, dst_count - - backward_op : grid_sample_grad forward : grid_sample (Tensor x, Tensor grid, str mode, str padding_mode, bool align_corners) -> Tensor(out) args : (Tensor x, Tensor grid, Tensor out_grad, str mode, str padding_mode, bool align_corners) @@ -870,6 +835,17 @@ func : hard_swish_grad inplace : (out_grad -> x_grad) +- backward_op : hardtanh_grad + forward : hardtanh (Tensor x, float t_min, float t_max) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float t_min, float t_max) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : hard_tanh_grad + inplace : (out_grad -> x_grad) + - backward_op : hierarchical_sigmoid_grad forward : hierarchical_sigmoid (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) -> Tensor(out), Tensor(pre_out), Tensor(w_out) args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, Tensor pre_out, Tensor out_grad, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) @@ 
-1624,12 +1600,12 @@ # output is optional - backward_op : put_along_axis_grad - forward : put_along_axis (Tensor arr, Tensor index, Tensor value, int axis, str reduce) -> Tensor(out) - args : (Tensor arr, Tensor index, Tensor out_grad, int axis, str reduce) + forward : put_along_axis (Tensor arr, Tensor indices, Tensor value, int axis, str reduce) -> Tensor(out) + args : (Tensor arr, Tensor indices, Tensor out_grad, int axis, str reduce) output : Tensor(arr_grad), Tensor(value_grad) infer_meta : func : GeneralBinaryGradInferMeta - param : [arr, index] + param : [arr, indices] kernel : func : put_along_axis_grad @@ -1911,6 +1887,30 @@ kernel : func : selu_grad +- backward_op : send_u_recv_grad + forward : send_u_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count) + args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM") + output : Tensor(x_grad) + infer_meta : + func : GeneralUnaryGradInferMeta + param : [x] + kernel : + func : send_u_recv_grad + data_type : out_grad + optional: out, dst_count + +- backward_op : send_ue_recv_grad + forward : send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) -> Tensor(out), Tensor(dst_count) + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : send_ue_recv_grad + data_type : out_grad + optional: out, dst_count + - backward_op : sigmoid_cross_entropy_with_logits_grad forward : sigmoid_cross_entropy_with_logits (Tensor x, Tensor label, bool normalize, int ignore_index) -> Tensor(out) args : (Tensor x, Tensor label, Tensor out_grad, bool normalize, int ignore_index) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index de290bd169f6e..c711f6bd42710 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -88,7 +88,7 @@ backward : add_n_grad - op : addmm - args : (Tensor input, Tensor x, Tensor y, float alpha, float beta) + args : (Tensor input, Tensor x, Tensor y, float beta, float alpha) output : Tensor infer_meta : func : AddmmInferMeta @@ -346,16 +346,6 @@ func : box_coder optional : prior_box_var -- op : brelu - args : (Tensor x, float t_min, float t_max) - output : Tensor - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : brelu - backward : brelu_grad - - op : cast args : (Tensor x, DataType dtype) output : Tensor @@ -508,15 +498,15 @@ output : Tensor(out) invoke : copy_to_impl(x, place, blocking) -- op : crop_tensor +- op : crop args : (Tensor x, IntArray shape, IntArray offsets) output : Tensor(out) infer_meta : - func : CropTensorInferMeta + func : CropInferMeta kernel : - func : crop_tensor + func : crop data_type : x - backward : crop_tensor_grad + backward : crop_grad # Part of python API paddle.nn.functional.cross_entropy - op : cross_entropy_with_softmax @@ -979,14 +969,14 @@ kernel : func : gather_tree -- op : gaussian_random +- op : gaussian args : (IntArray shape, float mean, float std, int seed, DataType dtype, Place place={}) output: Tensor(out) infer_meta : - func : GaussianRandomInferMeta + func : GaussianInferMeta param : [shape, mean, std, seed, dtype] kernel : - func : gaussian_random + func : gaussian 
param : [shape, mean, std, seed, dtype] data_type : dtype backend : place @@ -1009,28 +999,6 @@ kernel : func : generate_proposals_v2 -- op : graph_send_recv - args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) - output : Tensor(out), Tensor(dst_count) - infer_meta : - func : GraphSendRecvInferMeta - kernel : - func : graph_send_recv - data_type : x - intermediate : dst_count - backward : graph_send_recv_grad - -- op : graph_send_ue_recv - args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) - output : Tensor(out), Tensor(dst_count) - infer_meta : - func : GraphSendUERecvInferMeta - kernel : - func : graph_send_ue_recv - data_type : x - intermediate : dst_count - backward : graph_send_ue_recv_grad - - op : greater_equal args : (Tensor x, Tensor y, int axis = -1) output : Tensor(out) @@ -1108,6 +1076,16 @@ func : hard_swish backward : hardswish_grad +- op : hardtanh + args : (Tensor x, float t_min, float t_max) + output : Tensor + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : hard_tanh + backward : hardtanh_grad + - op : hierarchical_sigmoid args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) output : Tensor(out), Tensor(pre_out), Tensor(w_out) @@ -1958,7 +1936,7 @@ backward : psroi_pool_grad - op : put_along_axis - args : (Tensor arr, Tensor index, Tensor value, int axis, str reduce) + args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce) output : Tensor(out) infer_meta : func : UnchangedInferMeta @@ -2234,6 +2212,28 @@ func : selu backward : selu_grad +- op : send_u_recv + args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) + output : Tensor(out), Tensor(dst_count) + infer_meta : + func : SendURecvInferMeta + kernel : + func : send_u_recv + data_type : x + intermediate : dst_count + backward : send_u_recv_grad + +- op : send_ue_recv + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) + output : Tensor(out), Tensor(dst_count) + infer_meta : + func : SendUERecvInferMeta + kernel : + func : send_ue_recv + data_type : x + intermediate : dst_count + backward : send_ue_recv_grad + - op : sgd_ args : (Tensor param, Tensor learning_rate, Tensor grad, Tensor master_param, bool multi_precision) output : Tensor(param_out), Tensor(master_param_out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 304027861e3d6..533a9d9dc040b 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -704,6 +704,9 @@ extra : attrs : [bool deterministic = false, str rng_name = "", bool force_cpu = false] +- op : send_uv (graph_send_uv) + backward : send_uv_grad (graph_send_uv_grad) + - op : sequence_softmax backward : sequence_softmax_grad extra : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index e61b7490a15f7..39bdde76ca2a3 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -262,16 +262,6 @@ func : flip backward : flip_grad -- op : graph_send_uv - args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") - output : Tensor(out) - infer_meta : - func : GraphSendUVInferMeta - kernel : - func : graph_send_uv - data_type : x - backward : 
graph_send_uv_grad - - op : lgamma args : (Tensor x) output : Tensor(out) @@ -299,6 +289,16 @@ func : poisson backward : poisson_grad +- op : send_uv + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") + output : Tensor(out) + infer_meta : + func : SendUVInferMeta + kernel : + func : send_uv + data_type : x + backward : send_uv_grad + - op : sin args : (Tensor x) output : Tensor diff --git a/paddle/phi/api/yaml/sparse_backward.yaml b/paddle/phi/api/yaml/sparse_backward.yaml index ffb5406436faa..72c4cc61eea45 100644 --- a/paddle/phi/api/yaml/sparse_backward.yaml +++ b/paddle/phi/api/yaml/sparse_backward.yaml @@ -44,7 +44,7 @@ add_coo_dense_grad{sparse_coo, dense, sparse_coo -> sparse_coo, dense} - backward_op : addmm_grad - forward : addmm(Tensor input, Tensor x, Tensor y, float alpha=1.0, float beta=1.0) -> Tensor(out) + forward : addmm(Tensor input, Tensor x, Tensor y, float beta=1.0, float alpha=1.0) -> Tensor(out) args : (Tensor input, Tensor x, Tensor y, Tensor out_grad, float alpha=1.0, float beta=1.0) output : Tensor(input_grad), Tensor(x_grad), Tensor(y_grad) infer_meta : diff --git a/paddle/phi/api/yaml/sparse_ops.yaml b/paddle/phi/api/yaml/sparse_ops.yaml index a7b0d7a323bf9..12965ce9fbe2c 100644 --- a/paddle/phi/api/yaml/sparse_ops.yaml +++ b/paddle/phi/api/yaml/sparse_ops.yaml @@ -224,6 +224,17 @@ layout : x backward : relu6_grad +- op : reshape + args : (Tensor x, IntArray shape) + output : Tensor(out) + infer_meta : + func : ReshapeInferMeta + kernel : + func : reshape_coo{sparse_coo -> sparse_coo}, + reshape_csr{sparse_csr -> sparse_csr} + layout : x + backward : reshape_grad + - op : scale args : (Tensor x, float scale, float bias, bool bias_after_scale) output : Tensor(out) @@ -312,6 +323,17 @@ layout : x backward : subtract_grad +- op : sync_batch_norm_ + args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + infer_meta : + func : BatchNormInferMeta + kernel : + func : sync_batch_norm_coo{sparse_coo, dense, dense, dense, dense -> sparse_coo, dense, dense, dense, dense, dense} + data_type : x + backward : sync_batch_norm_grad + inplace : (mean -> mean_out), (variance -> variance_out) + - op : tan args : (Tensor x) output : Tensor(out) @@ -364,6 +386,18 @@ func : dense_to_csr {dense -> sparse_csr}, coo_to_csr {sparse_coo -> sparse_csr} +- op : transpose + args : (Tensor x, int[] perm) + output : Tensor(out) + infer_meta : + func : TransposeInferMeta + param: [ x, perm ] + kernel : + func : transpose_coo{sparse_coo -> sparse_coo}, + transpose_csr{sparse_csr -> sparse_csr} + layout : x + backward : transpose_grad + - op : values args : (Tensor x) output : Tensor(out) @@ -376,7 +410,7 @@ backward : values_grad - op: addmm - args : (Tensor input, Tensor x, Tensor y, float alpha=1.0, float beta=1.0) + args : (Tensor input, Tensor x, Tensor y, float beta=1.0, float alpha=1.0) output : Tensor(out) infer_meta : func : UnchangedInferMeta @@ -469,37 +503,3 @@ mv_csr{sparse_csr, dense -> dense} layout : x backward: mv_grad - -- op : transpose - args : (Tensor x, int[] perm) - output : Tensor(out) - infer_meta : - func : TransposeInferMeta - param: [ x, perm ] - kernel : - func : transpose_coo{sparse_coo -> sparse_coo}, - transpose_csr{sparse_csr -> sparse_csr} - layout 
: x - backward : transpose_grad - -- op : sync_batch_norm_ - args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) - output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) - infer_meta : - func : BatchNormInferMeta - kernel : - func : sync_batch_norm_coo{sparse_coo, dense, dense, dense, dense -> sparse_coo, dense, dense, dense, dense, dense} - data_type : x - backward : sync_batch_norm_grad - inplace : (mean -> mean_out), (variance -> variance_out) - -- op : reshape - args : (Tensor x, IntArray shape) - output : Tensor(out) - infer_meta : - func : ReshapeInferMeta - kernel : - func : reshape_coo{sparse_coo -> sparse_coo}, - reshape_csr{sparse_csr -> sparse_csr} - layout : x - backward : reshape_grad diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 10b859fdac260..9f62bffb7ecd1 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -81,7 +81,9 @@ static const std::unordered_set deprecated_op_names( "nearest_interp", "nearest_interp_grad", "bicubic_interp", - "bicubic_interp_grad"}); + "bicubic_interp_grad", + "crop", + "crop_grad"}); class DefaultKernelSignatureMap { public: diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index fd179a754a20e..6f8a60c9232fa 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -186,10 +186,10 @@ void Conv2dTransposeDoubleGradInferMeta(const MetaTensor& x, } } -void CropTensorGradInferMeta(const MetaTensor& out_grad, - const MetaTensor& x, - const IntArray& offsets, - MetaTensor* x_grad) { +void CropGradInferMeta(const MetaTensor& out_grad, + const MetaTensor& x, + const IntArray& offsets, + MetaTensor* x_grad) { auto x_dims = x.dims(); if (x_grad != nullptr) { diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 38372af4b306c..dd86055978a99 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -107,10 +107,10 @@ void Conv2dTransposeDoubleGradInferMeta(const MetaTensor& x, MetaTensor* dfilter, MetaTensor* ddout); -void CropTensorGradInferMeta(const MetaTensor& out_grad, - const MetaTensor& x, - const IntArray& offsets, - MetaTensor* x_grad); +void CropGradInferMeta(const MetaTensor& out_grad, + const MetaTensor& x, + const IntArray& offsets, + MetaTensor* x_grad); void CrossEntropyWithSoftmaxGradInferMeta(const MetaTensor& label, const MetaTensor& softmax, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 50eaa05bc625d..91d2642139a2c 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -2455,6 +2455,164 @@ void SgdInferMeta(const MetaTensor& param, param_out->set_dtype(param.dtype()); } +void SendUERecvInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& message_op, + const std::string& reduce_op, + const IntArray& out_size, + MetaTensor* out, + MetaTensor* dst_count) { + auto src_index_dims = src_index.dims(); + if (src_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(src_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Src_index should be 1 when it " + "is 2D, but we get %d", + src_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + src_index_dims.size(), + 1, + 
phi::errors::InvalidArgument( + "The Src_index should be 1D, when it is not 2D, but we get %d", + src_index_dims.size())); + } + + auto dst_index_dims = dst_index.dims(); + if (dst_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(dst_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Dst_index should be 1 when it " + "is 2D, but we get %d", + dst_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + dst_index_dims.size(), + 1, + phi::errors::InvalidArgument("The Dst_index should be 1D, " + "when it is not 2D, but we get %d", + dst_index_dims.size())); + } + + PADDLE_ENFORCE_EQ(src_index_dims[0], + dst_index_dims[0], + phi::errors::InvalidArgument( + "Src_index and Dst_index should have the same shape.")); + + auto y_dims = y.dims(); + PADDLE_ENFORCE_EQ( + y_dims[0], + src_index_dims[0], + phi::errors::InvalidArgument( + "Expect Input Y to have size %d as Src_index on the first dimension, " + "but we get %d", + src_index_dims[0], + y_dims[0])); + + auto x_dims = x.dims(); + if (reduce_op == "MEAN") { + dst_count->set_dims({-1}); + dst_count->set_dtype(DataType::INT32); + } + + // Infer out's shape according to x and e(need broadcasting condition) + out->set_dtype(x.dtype()); + auto x_dims1 = phi::vectorize(x_dims); + auto y_dims1 = phi::vectorize(y_dims); + std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); + std::vector y_dims2(y_dims1.begin() + 1, y_dims1.end()); + + int max_dim = std::max(x_dims2.size(), y_dims2.size()); + int axis = std::abs(static_cast(x_dims2.size() - y_dims2.size())); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + // Only need to broadcast dimensions other than the 0th dimension. + phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), + phi::make_ddim(y_dims2), + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + out_dims_array.insert(out_dims_array.begin(), -1); + out->set_dims(phi::make_ddim(out_dims_array)); +} + +void SendUVInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& message_op, + MetaTensor* out) { + auto src_index_dims = src_index.dims(); + if (src_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(src_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Src_index should be 1 when it " + "is 2D, but we get %d", + src_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + src_index_dims.size(), + 1, + phi::errors::InvalidArgument( + "The Src_index should be 1D, when it is not 2D, but we get %d", + src_index_dims.size())); + } + + auto dst_index_dims = dst_index.dims(); + if (dst_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(dst_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Dst_index should be 1 when it " + "is 2D, but we get %d", + dst_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + dst_index_dims.size(), + 1, + phi::errors::InvalidArgument("The Dst_index should be 1D, " + "when it is not 2D, but we get %d", + dst_index_dims.size())); + } + + PADDLE_ENFORCE_EQ(src_index_dims[0], + dst_index_dims[0], + phi::errors::InvalidArgument( + "Src_index and Dst_index should have the same shape.")); + + // Infer out's shape according to x and y(need broadcasting condition) + out->set_dtype(x.dtype()); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto x_dims1 = phi::vectorize(x_dims); + auto y_dims1 = phi::vectorize(y_dims); + std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); + std::vector 
y_dims2(y_dims1.begin() + 1, y_dims1.end()); + int max_dim = std::max(x_dims2.size(), y_dims2.size()); + int axis = std::abs(static_cast(x_dims2.size() - y_dims2.size())); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + // Only need to broadcast dimensions other than the 0th dimension. + phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), + phi::make_ddim(y_dims2), + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + out_dims_array.insert(out_dims_array.begin(), src_index_dims[0]); + out->set_dims(phi::make_ddim(out_dims_array)); +} + void StackInferMeta(const std::vector& x, int axis, MetaTensor* out, @@ -2751,164 +2909,6 @@ void Yolov3LossInferMeta(const MetaTensor& x, gt_match_mask->set_dtype(x.dtype()); } -void GraphSendUERecvInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& message_op, - const std::string& reduce_op, - const IntArray& out_size, - MetaTensor* out, - MetaTensor* dst_count) { - auto src_index_dims = src_index.dims(); - if (src_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(src_index_dims[1], - 1, - phi::errors::InvalidArgument( - "The last dim of Src_index should be 1 when it " - "is 2D, but we get %d", - src_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - src_index_dims.size(), - 1, - phi::errors::InvalidArgument( - "The Src_index should be 1D, when it is not 2D, but we get %d", - src_index_dims.size())); - } - - auto dst_index_dims = dst_index.dims(); - if (dst_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(dst_index_dims[1], - 1, - phi::errors::InvalidArgument( - "The last dim of Dst_index should be 1 when it " - "is 2D, but we get %d", - dst_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dst_index_dims.size(), - 1, - phi::errors::InvalidArgument("The Dst_index should be 1D, " - "when it is not 2D, but we get %d", - dst_index_dims.size())); - } - - PADDLE_ENFORCE_EQ(src_index_dims[0], - dst_index_dims[0], - phi::errors::InvalidArgument( - "Src_index and Dst_index should have the same shape.")); - - auto y_dims = y.dims(); - PADDLE_ENFORCE_EQ( - y_dims[0], - src_index_dims[0], - phi::errors::InvalidArgument( - "Expect Input Y to have size %d as Src_index on the first dimension, " - "but we get %d", - src_index_dims[0], - y_dims[0])); - - auto x_dims = x.dims(); - if (reduce_op == "MEAN") { - dst_count->set_dims({-1}); - dst_count->set_dtype(DataType::INT32); - } - - // Infer out's shape according to x and e(need broadcasting condition) - out->set_dtype(x.dtype()); - auto x_dims1 = phi::vectorize(x_dims); - auto y_dims1 = phi::vectorize(y_dims); - std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); - std::vector y_dims2(y_dims1.begin() + 1, y_dims1.end()); - - int max_dim = std::max(x_dims2.size(), y_dims2.size()); - int axis = std::abs(static_cast(x_dims2.size() - y_dims2.size())); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - // Only need to broadcast dimensions other than the 0th dimension. 
- phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), - phi::make_ddim(y_dims2), - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - out_dims_array.insert(out_dims_array.begin(), -1); - out->set_dims(phi::make_ddim(out_dims_array)); -} - -void GraphSendUVInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& message_op, - MetaTensor* out) { - auto src_index_dims = src_index.dims(); - if (src_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(src_index_dims[1], - 1, - phi::errors::InvalidArgument( - "The last dim of Src_index should be 1 when it " - "is 2D, but we get %d", - src_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - src_index_dims.size(), - 1, - phi::errors::InvalidArgument( - "The Src_index should be 1D, when it is not 2D, but we get %d", - src_index_dims.size())); - } - - auto dst_index_dims = dst_index.dims(); - if (dst_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(dst_index_dims[1], - 1, - phi::errors::InvalidArgument( - "The last dim of Dst_index should be 1 when it " - "is 2D, but we get %d", - dst_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dst_index_dims.size(), - 1, - phi::errors::InvalidArgument("The Dst_index should be 1D, " - "when it is not 2D, but we get %d", - dst_index_dims.size())); - } - - PADDLE_ENFORCE_EQ(src_index_dims[0], - dst_index_dims[0], - phi::errors::InvalidArgument( - "Src_index and Dst_index should have the same shape.")); - - // Infer out's shape according to x and y(need broadcasting condition) - out->set_dtype(x.dtype()); - auto x_dims = x.dims(); - auto y_dims = y.dims(); - auto x_dims1 = phi::vectorize(x_dims); - auto y_dims1 = phi::vectorize(y_dims); - std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); - std::vector y_dims2(y_dims1.begin() + 1, y_dims1.end()); - int max_dim = std::max(x_dims2.size(), y_dims2.size()); - int axis = std::abs(static_cast(x_dims2.size() - y_dims2.size())); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - // Only need to broadcast dimensions other than the 0th dimension. 
- phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), - phi::make_ddim(y_dims2), - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - out_dims_array.insert(out_dims_array.begin(), src_index_dims[0]); - out->set_dims(phi::make_ddim(out_dims_array)); -} - } // namespace phi PD_REGISTER_INFER_META_FN(batch_norm_infer, phi::BatchNormInferInferMeta); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 6a8b5511a6c4e..0dfb1307c02a8 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -451,6 +451,23 @@ void RnnInferMeta(const MetaTensor& x, std::vector state, MetaTensor* reserve); +void SendUERecvInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& message_op, + const std::string& reduce_op, + const IntArray& out_size, + MetaTensor* out, + MetaTensor* dst_count); + +void SendUVInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& message_op, + MetaTensor* out); + void SgdInferMeta(const MetaTensor& param, const MetaTensor& learning_rate, const MetaTensor& grad, @@ -506,21 +523,4 @@ void Yolov3LossInferMeta(const MetaTensor& x, MetaTensor* objectness_mask, MetaTensor* gt_match_mask); -void GraphSendUERecvInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& message_op, - const std::string& reduce_op, - const IntArray& out_size, - MetaTensor* out, - MetaTensor* dst_count); - -void GraphSendUVInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& message_op, - MetaTensor* out); - } // namespace phi diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index f9c432c2e0a79..fe83d869ce138 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -73,12 +73,12 @@ void EyeInferMeta(const Scalar& num_rows, out->set_dtype(dtype); } -void GaussianRandomInferMeta(const IntArray& shape, - float mean, - float std, - int seed, - DataType dtype, - MetaTensor* out) { +void GaussianInferMeta(const IntArray& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out) { auto out_dims = phi::make_ddim(shape.GetData()); out->set_dims(out_dims); out->set_dtype(dtype); diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index 27c89821a319e..64522cd03e682 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -48,12 +48,12 @@ void EyeInferMeta(const Scalar& num_rows, MetaTensor* out, MetaConfig config = MetaConfig()); -void GaussianRandomInferMeta(const IntArray& shape, - float mean, - float std, - int seed, - DataType dtype, - MetaTensor* out); +void GaussianInferMeta(const IntArray& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out); void RandpermInferMeta(int n, DataType dtype, MetaTensor* out); diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 6ba7011c9645d..874432aedd573 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -78,8 +78,8 @@ void AccuracyInferMeta(const MetaTensor& out, void AddmmInferMeta(const MetaTensor& input, const MetaTensor& x, const MetaTensor& y, - float alpha, float beta, + float alpha, MetaTensor* out) { auto input_dims = 
input.dims(); auto x_dims = x.dims(); @@ -402,13 +402,13 @@ void InstanceNormInferMeta(const MetaTensor& x, } } -void GraphSendRecvInferMeta(const MetaTensor& x, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& reduce_op, - const IntArray& out_size, - MetaTensor* out, - MetaTensor* dst_count) { +void SendURecvInferMeta(const MetaTensor& x, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& reduce_op, + const IntArray& out_size, + MetaTensor* out, + MetaTensor* dst_count) { auto src_index_dims = src_index.dims(); if (src_index_dims.size() == 2) { PADDLE_ENFORCE_EQ(src_index_dims[1], diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 5314b8f45affe..e0b1573e16679 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -44,8 +44,8 @@ void AccuracyInferMeta(const MetaTensor& out, void AddmmInferMeta(const MetaTensor& input, const MetaTensor& x, const MetaTensor& y, - float alpha, float beta, + float alpha, MetaTensor* out); void ArangeInferMeta(const MetaTensor& start, @@ -72,13 +72,13 @@ void InstanceNormInferMeta(const MetaTensor& x, MetaTensor* saved_variance, MetaConfig config = MetaConfig()); -void GraphSendRecvInferMeta(const MetaTensor& x, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& reduce_op, - const IntArray& out_size, - MetaTensor* out, - MetaTensor* dst_count); +void SendURecvInferMeta(const MetaTensor& x, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& reduce_op, + const IntArray& out_size, + MetaTensor* out, + MetaTensor* dst_count); void GroupNormInferMeta(const MetaTensor& x, const MetaTensor& scale, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 1e4c226a9a976..d83477de96181 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -436,11 +436,11 @@ void CumScalarAxisInferMeta(const MetaTensor& x, CumInferMeta(x, axis.to(), flatten, exclusive, reverse, out); } -void CropTensorInferMeta(const MetaTensor& x, - const IntArray& shape, - const IntArray& offsets, - MetaTensor* out, - MetaConfig config) { +void CropInferMeta(const MetaTensor& x, + const IntArray& shape, + const IntArray& offsets, + MetaTensor* out, + MetaConfig config) { PADDLE_ENFORCE_NE( out, nullptr, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 3273a043735a5..66f72a681e2a0 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -82,11 +82,11 @@ void ClipByNormInferMeta(const MetaTensor& x, float max_norm, MetaTensor* out); void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out); -void CropTensorInferMeta(const MetaTensor& x, - const IntArray& shape, - const IntArray& offsets, - MetaTensor* out, - MetaConfig config = MetaConfig()); +void CropInferMeta(const MetaTensor& x, + const IntArray& shape, + const IntArray& offsets, + MetaTensor* out, + MetaConfig config = MetaConfig()); void CumInferMeta(const MetaTensor& x, int axis, diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index 46a9830882f6f..d1f6c1c4e573e 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -243,7 +243,7 @@ DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, alpha); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6, threshold); 
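The BRelu to HardTanh rename that follows keeps the math unchanged: hard_tanh clamps its input into [t_min, t_max], and its gradient is the indicator of the open interval. A rough NumPy sketch of the functor pair (illustrative only, not the registered kernels):

    import numpy as np

    def hard_tanh(x, t_min=-1.0, t_max=1.0):
        # forward: clamp the input into [t_min, t_max]
        return np.clip(x, t_min, t_max)

    def hard_tanh_grad(x, dout, t_min=-1.0, t_max=1.0):
        # backward: gradients pass only strictly inside the interval
        return dout * ((x > t_min) & (x < t_max)).astype(dout.dtype)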
-DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh, t_min, t_max); DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b); DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, beta, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset); diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index 8a83226b23027..15545b931da05 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -85,7 +85,7 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Swish, beta) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Celu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Logit, eps) -DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardTanh, t_min, t_max) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus, beta, threshold) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset) diff --git a/paddle/phi/kernels/addmm_kernel.h b/paddle/phi/kernels/addmm_kernel.h index 3674305796cde..a04235e090462 100644 --- a/paddle/phi/kernels/addmm_kernel.h +++ b/paddle/phi/kernels/addmm_kernel.h @@ -23,8 +23,8 @@ void AddmmKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& x, const DenseTensor& y, - float alpha, float beta, + float alpha, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index 7927f595927fc..c3df87de24349 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -173,8 +173,8 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, threshold); DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, CELUGradFunctor, alpha); -DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, - BReluGradFunctor, +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh, + HardTanhGradFunctor, t_min, t_max); @@ -263,7 +263,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_tanh_grad, HardTanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, ThresholdedReluGradKernel) diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index ac75c4ad3479f..4cbe46bd6658f 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -104,7 +104,7 @@ DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, SwishFunctor, beta) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Celu, CELUFunctor, alpha) -DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardTanh, HardTanhFunctor, t_min, t_max) DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(STanh, STanhFunctor, scale_a, scale_b) DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, SoftplusFunctor, beta, threshold) DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, @@ -146,7 +146,7 @@ PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel) 
PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel) PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel) PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) -PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_tanh, HardTanhKernel) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel) diff --git a/paddle/phi/kernels/cpu/crop_tensor_kernel.cc b/paddle/phi/kernels/cpu/crop_grad_kernel.cc similarity index 82% rename from paddle/phi/kernels/cpu/crop_tensor_kernel.cc rename to paddle/phi/kernels/cpu/crop_grad_kernel.cc index 8cd42d5fa8239..6d689c9413397 100644 --- a/paddle/phi/kernels/cpu/crop_tensor_kernel.cc +++ b/paddle/phi/kernels/cpu/crop_grad_kernel.cc @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/crop_tensor_kernel.h" +#include "paddle/phi/kernels/crop_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/crop_tensor_kernel_impl.h" +#include "paddle/phi/kernels/impl/crop_grad_kernel_impl.h" -PD_REGISTER_KERNEL(crop_tensor, +PD_REGISTER_KERNEL(crop_grad, CPU, ALL_LAYOUT, - phi::CropTensorKernel, + phi::CropGradKernel, float, double, int, diff --git a/paddle/phi/kernels/cpu/crop_tensor_grad_kernel.cc b/paddle/phi/kernels/cpu/crop_kernel.cc similarity index 66% rename from paddle/phi/kernels/cpu/crop_tensor_grad_kernel.cc rename to paddle/phi/kernels/cpu/crop_kernel.cc index 6ac553ec9786b..a317cada25be2 100644 --- a/paddle/phi/kernels/cpu/crop_tensor_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/crop_kernel.cc @@ -12,17 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/crop_tensor_grad_kernel.h" +#include "paddle/phi/kernels/crop_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/crop_tensor_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/crop_kernel_impl.h" -PD_REGISTER_KERNEL(crop_tensor_grad, - CPU, - ALL_LAYOUT, - phi::CropTensorGradKernel, - float, - double, - int, - int64_t) {} +PD_REGISTER_KERNEL( + crop, CPU, ALL_LAYOUT, phi::CropKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/gaussian_kernel.cc similarity index 68% rename from paddle/phi/kernels/cpu/gaussian_random_kernel.cc rename to paddle/phi/kernels/cpu/gaussian_kernel.cc index c600149cbbacc..c4efc508aac83 100644 --- a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc +++ b/paddle/phi/kernels/cpu/gaussian_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/gaussian_random_kernel.h" +#include "paddle/phi/kernels/gaussian_kernel.h" #include "paddle/fluid/framework/generator.h" #include "paddle/phi/backends/cpu/cpu_context.h" @@ -21,13 +21,13 @@ namespace phi { template -void GaussianRandomKernel(const Context& dev_ctx, - const IntArray& shape, - float mean, - float std, - int seed, - DataType dtype, - DenseTensor* out) { +void GaussianKernel(const Context& dev_ctx, + const IntArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out) { auto tensor = out; std::normal_distribution dist(mean, std); @@ -44,9 +44,5 @@ void GaussianRandomKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(gaussian_random, - CPU, - ALL_LAYOUT, - phi::GaussianRandomKernel, - float, - double) {} +PD_REGISTER_KERNEL( + gaussian, CPU, ALL_LAYOUT, phi::GaussianKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/send_u_recv_grad_kernel.cc similarity index 89% rename from paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc rename to paddle/phi/kernels/cpu/send_u_recv_grad_kernel.cc index d4131a1ffb5e3..f5fcc82b9b9e2 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/send_u_recv_grad_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/graph_send_recv_grad_kernel.h" +#include "paddle/phi/kernels/send_u_recv_grad_kernel.h" #include #include @@ -117,15 +117,15 @@ void GraphSendRecvGradOpKernelLaunchHelper( } template -void GraphSendRecvGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const paddle::optional& out, - const paddle::optional& dst_count, - const DenseTensor& out_grad, - const std::string& reduce_op, - DenseTensor* x_grad) { +void SendURecvGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const paddle::optional& out, + const paddle::optional& dst_count, + const DenseTensor& out_grad, + const std::string& reduce_op, + DenseTensor* x_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendRecvGradOpKernelLaunchHelper( @@ -154,10 +154,10 @@ void GraphSendRecvGradKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_recv_grad, +PD_REGISTER_KERNEL(send_u_recv_grad, CPU, ALL_LAYOUT, - phi::GraphSendRecvGradKernel, + phi::SendURecvGradKernel, float, double, int, diff --git a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc b/paddle/phi/kernels/cpu/send_u_recv_kernel.cc similarity index 93% rename from paddle/phi/kernels/cpu/graph_send_recv_kernel.cc rename to paddle/phi/kernels/cpu/send_u_recv_kernel.cc index 7985a65a20053..e27be3244cb7a 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/send_u_recv_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/graph_send_recv_kernel.h" +#include "paddle/phi/kernels/send_u_recv_kernel.h" #include #include @@ -144,14 +144,14 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx, } template -void GraphSendRecvKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& reduce_op, - const IntArray& out_size, - DenseTensor* out, - DenseTensor* dst_count) { +void SendURecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& reduce_op, + const IntArray& out_size, + DenseTensor* out, + DenseTensor* dst_count) { auto index_type = src_index.dtype(); auto& out_size_data = out_size.GetData(); if (index_type == phi::DataType::INT32) { @@ -177,10 +177,10 @@ void GraphSendRecvKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_recv, +PD_REGISTER_KERNEL(send_u_recv, CPU, ALL_LAYOUT, - phi::GraphSendRecvKernel, + phi::SendURecvKernel, float, double, int, diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc similarity index 95% rename from paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc rename to paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc index 912426a778d0c..5c951b9bb099f 100644 --- a/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h" +#include "paddle/phi/kernels/send_ue_recv_grad_kernel.h" #include #include @@ -443,18 +443,18 @@ void GraphSendUERecvGradOpKernelLaunchHelper( } template -void GraphSendUERecvGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const paddle::optional& out, - const paddle::optional& dst_count, - const DenseTensor& out_grad, - const std::string& message_op, - const std::string& reduce_op, - DenseTensor* x_grad, - DenseTensor* y_grad) { +void SendUERecvGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const paddle::optional& out, + const paddle::optional& dst_count, + const DenseTensor& out_grad, + const std::string& message_op, + const std::string& reduce_op, + DenseTensor* x_grad, + DenseTensor* y_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendUERecvGradOpKernelLaunchHelper( @@ -489,10 +489,10 @@ void GraphSendUERecvGradKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_ue_recv_grad, +PD_REGISTER_KERNEL(send_ue_recv_grad, CPU, ALL_LAYOUT, - phi::GraphSendUERecvGradKernel, + phi::SendUERecvGradKernel, float, double, int, diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc b/paddle/phi/kernels/cpu/send_ue_recv_kernel.cc similarity index 94% rename from paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc rename to paddle/phi/kernels/cpu/send_ue_recv_kernel.cc index ab9adc3897170..4297077f383c8 100644 --- a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/send_ue_recv_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
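// [Editor's note] send_u_recv (formerly graph_send_recv, renamed above)
// gathers rows of x at src_index and scatter-reduces them into the output at
// dst_index. A minimal CPU sketch of the "SUM" reduce path, assuming float
// data, int indices, and row-major layout (illustrative only; the real
// kernels dispatch on dtype and reduce_op):
#include <cstddef>
#include <cstdint>
#include <vector>
std::vector<float> send_u_recv_sum_ref(const std::vector<float>& x,
                                       int64_t cols,
                                       const std::vector<int>& src_index,
                                       const std::vector<int>& dst_index,
                                       int64_t out_rows) {
  std::vector<float> out(out_rows * cols, 0.0f);
  for (std::size_t e = 0; e < src_index.size(); ++e) {      // one pass per edge
    const float* src = x.data() + src_index[e] * cols;      // message from u
    float* dst = out.data() + dst_index[e] * cols;          // accumulated at v
    for (int64_t c = 0; c < cols; ++c) dst[c] += src[c];
  }
  return out;
}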
-#include "paddle/phi/kernels/graph_send_ue_recv_kernel.h" +#include "paddle/phi/kernels/send_ue_recv_kernel.h" #include #include @@ -244,16 +244,16 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, } template -void GraphSendUERecvKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& message_op, - const std::string& reduce_op, - const IntArray& out_size, - DenseTensor* out, - DenseTensor* dst_count) { +void SendUERecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& message_op, + const std::string& reduce_op, + const IntArray& out_size, + DenseTensor* out, + DenseTensor* dst_count) { auto index_type = src_index.dtype(); auto& out_size_data = out_size.GetData(); if (index_type == phi::DataType::INT32) { @@ -283,10 +283,10 @@ void GraphSendUERecvKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_ue_recv, +PD_REGISTER_KERNEL(send_ue_recv, CPU, ALL_LAYOUT, - phi::GraphSendUERecvKernel, + phi::SendUERecvKernel, float, double, int, diff --git a/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc b/paddle/phi/kernels/cpu/send_uv_grad_kernel.cc similarity index 94% rename from paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc rename to paddle/phi/kernels/cpu/send_uv_grad_kernel.cc index 23e5172c3afa7..cc0af1065e3f3 100644 --- a/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/send_uv_grad_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/graph_send_uv_grad_kernel.h" +#include "paddle/phi/kernels/send_uv_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/hostdevice.h" @@ -229,15 +229,15 @@ void GraphSendUVGradOpKernelLaunchHelper(const Context& ctx, } template -void GraphSendUVGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const DenseTensor& out_grad, - const std::string& message_op, - DenseTensor* x_grad, - DenseTensor* y_grad) { +void SendUVGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const DenseTensor& out_grad, + const std::string& message_op, + DenseTensor* x_grad, + DenseTensor* y_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendUVGradOpKernelLaunchHelper( @@ -250,10 +250,10 @@ void GraphSendUVGradKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_uv_grad, +PD_REGISTER_KERNEL(send_uv_grad, CPU, ALL_LAYOUT, - phi::GraphSendUVGradKernel, + phi::SendUVGradKernel, float, double, int, diff --git a/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc b/paddle/phi/kernels/cpu/send_uv_kernel.cc similarity index 88% rename from paddle/phi/kernels/cpu/graph_send_uv_kernel.cc rename to paddle/phi/kernels/cpu/send_uv_kernel.cc index 2183eb2a4c593..4d235c4dd7d12 100644 --- a/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc +++ b/paddle/phi/kernels/cpu/send_uv_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/graph_send_uv_kernel.h" +#include "paddle/phi/kernels/send_uv_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/hostdevice.h" @@ -102,13 +102,13 @@ void GraphSendUVOpKernelLaunchHelper(const Context& ctx, } template -void GraphSendUVKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& message_op, - DenseTensor* out) { +void SendUVKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& message_op, + DenseTensor* out) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendUVOpKernelLaunchHelper( @@ -121,11 +121,5 @@ void GraphSendUVKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_uv, - CPU, - ALL_LAYOUT, - phi::GraphSendUVKernel, - float, - double, - int, - int64_t) {} +PD_REGISTER_KERNEL( + send_uv, CPU, ALL_LAYOUT, phi::SendUVKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/crop_tensor_kernel.h b/paddle/phi/kernels/crop_grad_kernel.h similarity index 78% rename from paddle/phi/kernels/crop_tensor_kernel.h rename to paddle/phi/kernels/crop_grad_kernel.h index 079959eb05c14..d51ff2366e68b 100644 --- a/paddle/phi/kernels/crop_tensor_kernel.h +++ b/paddle/phi/kernels/crop_grad_kernel.h @@ -20,10 +20,10 @@ namespace phi { template -void CropTensorKernel(const Context& dev_ctx, - const DenseTensor& x, - const IntArray& shape, - const IntArray& offsets, - DenseTensor* out); +void CropGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const IntArray& offsets, + DenseTensor* x_grad); } // namespace phi diff --git a/paddle/phi/kernels/crop_tensor_grad_kernel.h b/paddle/phi/kernels/crop_kernel.h similarity index 76% rename from paddle/phi/kernels/crop_tensor_grad_kernel.h rename to paddle/phi/kernels/crop_kernel.h index 97f1fbf5b029a..97fc49492b13d 100644 --- a/paddle/phi/kernels/crop_tensor_grad_kernel.h +++ b/paddle/phi/kernels/crop_kernel.h @@ -20,10 +20,10 @@ namespace phi { template -void CropTensorGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out_grad, - const IntArray& offsets, - DenseTensor* x_grad); +void CropKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& shape, + const IntArray& offsets, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 2af106ca38c48..d0ab1e7f2b372 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -956,7 +956,7 @@ struct TanhTripleGradFunctor : public BaseActivationFunctor { }; template -struct BReluFunctor : public BaseActivationFunctor { +struct HardTanhFunctor : public BaseActivationFunctor { float t_min; float t_max; @@ -974,7 +974,7 @@ struct BReluFunctor : public BaseActivationFunctor { }; template -struct BReluGradFunctor : public BaseActivationFunctor { +struct HardTanhGradFunctor : public BaseActivationFunctor { float t_min; float t_max; typename BaseActivationFunctor::AttrPair GetAttrs() { @@ -2707,7 +2707,7 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor { }; template -struct CudaBReluFunctor : public BaseActivationFunctor { +struct CudaHardTanhFunctor : public BaseActivationFunctor { float t_min; float t_max; @@ -2775,7 +2775,7 @@ 
struct CudaMishGradFunctor : public BaseActivationFunctor { }; template -struct CudaBReluGradFunctor : public BaseActivationFunctor { +struct CudaHardTanhGradFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); float t_min; float t_max; diff --git a/paddle/phi/kernels/gaussian_random_kernel.h b/paddle/phi/kernels/gaussian_kernel.h similarity index 74% rename from paddle/phi/kernels/gaussian_random_kernel.h rename to paddle/phi/kernels/gaussian_kernel.h index 7424ad484a1fd..a04c8802cf385 100644 --- a/paddle/phi/kernels/gaussian_random_kernel.h +++ b/paddle/phi/kernels/gaussian_kernel.h @@ -21,12 +21,12 @@ namespace phi { template -void GaussianRandomKernel(const Context& ctx, - const IntArray& shape, - float mean, - float std, - int seed, - DataType dtype, - DenseTensor* out); +void GaussianKernel(const Context& ctx, + const IntArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index b947c70cb89d4..c93baf86da950 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -228,8 +228,8 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6, CudaRelu6GradFunctor, threshold); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, - CudaBReluGradFunctor, +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh, + CudaHardTanhGradFunctor, t_min, t_max); @@ -346,7 +346,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_double_grad, TanhDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_triple_grad, TanhTripleGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_tanh_grad, HardTanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad, LeakyReluDoubleGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index e57332c40756a..e22c477d852cb 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -122,7 +122,10 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, CudaSwishFunctor, beta) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, CudaMishFunctor, threshold) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Celu, CudaCELUFunctor, alpha) -DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) +DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardTanh, + CudaHardTanhFunctor, + t_min, + t_max) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Stanh, CudaSTanhFunctor, scale_a, scale_b) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, CudaSoftplusFunctor, @@ -193,7 +196,7 @@ PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel) PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel) PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel) PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) -PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_tanh, HardTanhKernel) PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) diff --git a/paddle/phi/kernels/gpu/crop_tensor_kernel.cu b/paddle/phi/kernels/gpu/crop_grad_kernel.cu similarity index 82% rename from 
paddle/phi/kernels/gpu/crop_tensor_kernel.cu rename to paddle/phi/kernels/gpu/crop_grad_kernel.cu index 5aa4900c5097b..f8ee07abc638d 100644 --- a/paddle/phi/kernels/gpu/crop_tensor_kernel.cu +++ b/paddle/phi/kernels/gpu/crop_grad_kernel.cu @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/crop_tensor_kernel.h" +#include "paddle/phi/kernels/crop_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/crop_tensor_kernel_impl.h" +#include "paddle/phi/kernels/impl/crop_grad_kernel_impl.h" -PD_REGISTER_KERNEL(crop_tensor, +PD_REGISTER_KERNEL(crop_grad, GPU, ALL_LAYOUT, - phi::CropTensorKernel, + phi::CropGradKernel, float, double, int, diff --git a/paddle/phi/kernels/gpu/crop_tensor_grad_kernel.cu b/paddle/phi/kernels/gpu/crop_kernel.cu similarity index 66% rename from paddle/phi/kernels/gpu/crop_tensor_grad_kernel.cu rename to paddle/phi/kernels/gpu/crop_kernel.cu index 0af80233cb1ef..ee9b6a160c48d 100644 --- a/paddle/phi/kernels/gpu/crop_tensor_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/crop_kernel.cu @@ -12,17 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/crop_tensor_grad_kernel.h" +#include "paddle/phi/kernels/crop_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/crop_tensor_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/crop_kernel_impl.h" -PD_REGISTER_KERNEL(crop_tensor_grad, - GPU, - ALL_LAYOUT, - phi::CropTensorGradKernel, - float, - double, - int, - int64_t) {} +PD_REGISTER_KERNEL( + crop, GPU, ALL_LAYOUT, phi::CropKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_kernel.cu similarity index 86% rename from paddle/phi/kernels/gpu/gaussian_random_kernel.cu rename to paddle/phi/kernels/gpu/gaussian_kernel.cu index f1edca70777fe..6caf56c3b5127 100644 --- a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
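// [Editor's note] The crop/crop_grad renames above change names only: crop
// (ex crop_tensor) slices a window of size `shape` starting at `offsets` out
// of x, and crop_grad pads the incoming gradient back into a zero tensor of
// x's shape. A 1-D reference of both (illustrative, not the kernel impls):
#include <cstddef>
#include <vector>
std::vector<float> crop_ref(const std::vector<float>& x,
                            std::size_t offset, std::size_t len) {
  return std::vector<float>(x.begin() + offset, x.begin() + offset + len);
}
std::vector<float> crop_grad_ref(const std::vector<float>& out_grad,
                                 std::size_t offset, std::size_t x_len) {
  std::vector<float> x_grad(x_len, 0.0f);  // gradient is zero outside the window
  for (std::size_t i = 0; i < out_grad.size(); ++i) x_grad[offset + i] = out_grad[i];
  return x_grad;
}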
-#include "paddle/phi/kernels/gaussian_random_kernel.h" +#include "paddle/phi/kernels/gaussian_kernel.h" #include @@ -52,13 +52,13 @@ struct GaussianGenerator { }; template -void GaussianRandomKernel(const Context& dev_ctx, - const IntArray& shape, - float mean, - float std, - int seed, - DataType dtype, - DenseTensor* out) { +void GaussianKernel(const Context& dev_ctx, + const IntArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); dev_ctx.template Alloc(out); if (seed == 0) { @@ -78,10 +78,10 @@ void GaussianRandomKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(gaussian_random, +PD_REGISTER_KERNEL(gaussian, GPU, ALL_LAYOUT, - phi::GaussianRandomKernel, + phi::GaussianKernel, phi::dtype::float16, phi::dtype::bfloat16, float, diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h index e352c50bdc283..180a9dfac854b 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h @@ -22,7 +22,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/graph_send_recv_kernel.h" +#include "paddle/phi/kernels/send_u_recv_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_u_recv_grad_kernel.cu similarity index 86% rename from paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu rename to paddle/phi/kernels/gpu/send_u_recv_grad_kernel.cu index d058ee63c3d2f..a7e4e32ed1d17 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_u_recv_grad_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
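// [Editor's note] The gaussian (ex gaussian_random) rename above likewise
// leaves the sampling unchanged. The CPU path boils down to the following
// sketch (illustrative; the registered kernels also handle seed-0 via the
// global generator and dispatch on dtype):
#include <cstddef>
#include <random>
#include <vector>
std::vector<float> gaussian_ref(std::size_t n, float mean, float std_dev,
                                unsigned seed) {
  std::mt19937_64 engine(seed);
  std::normal_distribution<float> dist(mean, std_dev);
  std::vector<float> out(n);
  for (float& v : out) v = dist(engine);  // fill elementwise with N(mean, std)
  return out;
}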
-#include "paddle/phi/kernels/graph_send_recv_grad_kernel.h" +#include "paddle/phi/kernels/send_u_recv_grad_kernel.h" #include #include @@ -98,15 +98,15 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper( } template -void GraphSendRecvGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const paddle::optional& out, - const paddle::optional& dst_count, - const DenseTensor& out_grad, - const std::string& reduce_op, - DenseTensor* x_grad) { +void SendURecvGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const paddle::optional& out, + const paddle::optional& dst_count, + const DenseTensor& out_grad, + const std::string& reduce_op, + DenseTensor* x_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendRecvGradOpCUDAKernelLaunchHelper( @@ -135,10 +135,10 @@ void GraphSendRecvGradKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_recv_grad, +PD_REGISTER_KERNEL(send_u_recv_grad, GPU, ALL_LAYOUT, - phi::GraphSendRecvGradKernel, + phi::SendURecvGradKernel, float, double, int, diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/send_u_recv_kernel.cu similarity index 93% rename from paddle/phi/kernels/gpu/graph_send_recv_kernel.cu rename to paddle/phi/kernels/gpu/send_u_recv_kernel.cu index 055d4888e3f56..0f000af536d8b 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_u_recv_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/graph_send_recv_kernel.h" +#include "paddle/phi/kernels/send_u_recv_kernel.h" #include #include @@ -154,14 +154,14 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, } template -void GraphSendRecvKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& reduce_op, - const IntArray& out_size, - DenseTensor* out, - DenseTensor* dst_count) { +void SendURecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& reduce_op, + const IntArray& out_size, + DenseTensor* out, + DenseTensor* dst_count) { auto index_type = src_index.dtype(); auto& out_size_data = out_size.GetData(); if (index_type == phi::DataType::INT32) { @@ -187,10 +187,10 @@ void GraphSendRecvKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_recv, +PD_REGISTER_KERNEL(send_u_recv, GPU, ALL_LAYOUT, - phi::GraphSendRecvKernel, + phi::SendURecvKernel, float, double, int, diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu similarity index 96% rename from paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu rename to paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu index 41667be1a9545..281bf6278afa4 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
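// [Editor's note] For reduce_op == "SUM", the backward of send_u_recv (the
// grad kernels renamed above) is the same gather/scatter with the index roles
// swapped: the incoming gradient at dst_index flows back to src_index. A
// minimal 1-D reference (illustrative only):
#include <cstddef>
#include <vector>
std::vector<float> send_u_recv_sum_grad_ref(const std::vector<float>& out_grad,
                                            const std::vector<int>& src_index,
                                            const std::vector<int>& dst_index,
                                            std::size_t x_rows) {
  std::vector<float> x_grad(x_rows, 0.0f);
  for (std::size_t e = 0; e < src_index.size(); ++e)
    x_grad[src_index[e]] += out_grad[dst_index[e]];  // transpose of the forward scatter
  return x_grad;
}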
-#include "paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h" +#include "paddle/phi/kernels/send_ue_recv_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" @@ -556,18 +556,18 @@ void GraphSendUERecvGradOpCUDAKernelLaunchHelper( } template -void GraphSendUERecvGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const paddle::optional& out, - const paddle::optional& dst_count, - const DenseTensor& out_grad, - const std::string& message_op, - const std::string& reduce_op, - DenseTensor* x_grad, - DenseTensor* y_grad) { +void SendUERecvGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const paddle::optional& out, + const paddle::optional& dst_count, + const DenseTensor& out_grad, + const std::string& message_op, + const std::string& reduce_op, + DenseTensor* x_grad, + DenseTensor* y_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendUERecvGradOpCUDAKernelLaunchHelper( @@ -602,10 +602,10 @@ void GraphSendUERecvGradKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_ue_recv_grad, +PD_REGISTER_KERNEL(send_ue_recv_grad, GPU, ALL_LAYOUT, - phi::GraphSendUERecvGradKernel, + phi::SendUERecvGradKernel, float, double, int, diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu similarity index 94% rename from paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu rename to paddle/phi/kernels/gpu/send_ue_recv_kernel.cu index 8a5897316ca9c..482077b7f93bf 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu @@ -12,10 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/graph_send_ue_recv_kernel.h" -#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" -#include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_message_passing_impl.h" +#include "paddle/phi/kernels/send_ue_recv_kernel.h" #include #include @@ -26,6 +23,9 @@ #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" +#include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" +#include "paddle/phi/kernels/impl/graph_message_passing_impl.h" namespace phi { @@ -282,16 +282,16 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, } template -void GraphSendUERecvKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& message_op, - const std::string& reduce_op, - const IntArray& out_size, - DenseTensor* out, - DenseTensor* dst_count) { +void SendUERecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& message_op, + const std::string& reduce_op, + const IntArray& out_size, + DenseTensor* out, + DenseTensor* dst_count) { auto index_type = src_index.dtype(); auto& out_size_data = out_size.GetData(); if (index_type == phi::DataType::INT32) { @@ -323,10 +323,10 @@ void GraphSendUERecvKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_ue_recv, +PD_REGISTER_KERNEL(send_ue_recv, GPU, ALL_LAYOUT, - phi::GraphSendUERecvKernel, + phi::SendUERecvKernel, float, double, int, diff --git a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu similarity index 95% rename from paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu rename to paddle/phi/kernels/gpu/send_uv_grad_kernel.cu index 1671fa7e17cd0..ad904c8ae2d88 100644 --- a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
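// [Editor's note] send_ue_recv (ex graph_send_ue_recv, renamed above) differs
// from send_u_recv by combining the source-node feature with an edge feature
// before reducing. A compact reference for the message_op == "ADD",
// reduce_op == "SUM" combination with 1-D features (illustrative only):
#include <cstddef>
#include <vector>
std::vector<float> send_ue_recv_add_sum_ref(const std::vector<float>& x,
                                            const std::vector<float>& y,
                                            const std::vector<int>& src_index,
                                            const std::vector<int>& dst_index,
                                            std::size_t out_rows) {
  std::vector<float> out(out_rows, 0.0f);
  for (std::size_t e = 0; e < src_index.size(); ++e)
    out[dst_index[e]] += x[src_index[e]] + y[e];  // message = x[u] + y[e], then SUM
  return out;
}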
-#include "paddle/phi/kernels/graph_send_uv_grad_kernel.h" +#include "paddle/phi/kernels/send_uv_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/hostdevice.h" @@ -285,15 +285,15 @@ void GraphSendUVGradOpCUDAKernelLaunchHelper(const Context& ctx, } template -void GraphSendUVGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const DenseTensor& out_grad, - const std::string& message_op, - DenseTensor* x_grad, - DenseTensor* y_grad) { +void SendUVGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const DenseTensor& out_grad, + const std::string& message_op, + DenseTensor* x_grad, + DenseTensor* y_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendUVGradOpCUDAKernelLaunchHelper( @@ -306,10 +306,10 @@ void GraphSendUVGradKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_uv_grad, +PD_REGISTER_KERNEL(send_uv_grad, GPU, ALL_LAYOUT, - phi::GraphSendUVGradKernel, + phi::SendUVGradKernel, float, double, int, diff --git a/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu b/paddle/phi/kernels/gpu/send_uv_kernel.cu similarity index 92% rename from paddle/phi/kernels/gpu/graph_send_uv_kernel.cu rename to paddle/phi/kernels/gpu/send_uv_kernel.cu index 32b8b014d0c22..69c1515e8124d 100644 --- a/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_uv_kernel.cu @@ -12,9 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/graph_send_uv_kernel.h" -#include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_message_passing_impl.h" +#include "paddle/phi/kernels/send_uv_kernel.h" #include @@ -22,6 +20,8 @@ #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" +#include "paddle/phi/kernels/impl/graph_message_passing_impl.h" namespace phi { @@ -142,13 +142,13 @@ void GraphSendUVOpCUDAKernelLaunchHelper(const Context& ctx, } template -void GraphSendUVKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& message_op, - DenseTensor* out) { +void SendUVKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& message_op, + DenseTensor* out) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendUVOpCUDAKernelLaunchHelper( @@ -161,10 +161,10 @@ void GraphSendUVKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_uv, +PD_REGISTER_KERNEL(send_uv, GPU, ALL_LAYOUT, - phi::GraphSendUVKernel, + phi::SendUVKernel, float, double, int, diff --git a/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h deleted file mode 100644 index 74050d126259d..0000000000000 --- a/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/utils/optional.h" - -namespace phi { - -template -void GraphSendUERecvGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const paddle::optional& out, - const paddle::optional& dst_count, - const DenseTensor& out_grad, - const std::string& message_op, - const std::string& reduce_op, - DenseTensor* x_grad, - DenseTensor* y_grad); -} // namespace phi diff --git a/paddle/phi/kernels/graph_send_ue_recv_kernel.h b/paddle/phi/kernels/graph_send_ue_recv_kernel.h deleted file mode 100644 index a308a78800f3a..0000000000000 --- a/paddle/phi/kernels/graph_send_ue_recv_kernel.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "paddle/phi/common/int_array.h" -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void GraphSendUERecvKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& message_op, - const std::string& reduce_op, - const IntArray& out_size, - DenseTensor* out, - DenseTensor* dst_count); - -} // namespace phi diff --git a/paddle/phi/kernels/graph_send_uv_grad_kernel.h b/paddle/phi/kernels/graph_send_uv_grad_kernel.h deleted file mode 100644 index fa2285627a4b7..0000000000000 --- a/paddle/phi/kernels/graph_send_uv_grad_kernel.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void GraphSendUVGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const DenseTensor& out_grad, - const std::string& message_op, - DenseTensor* x_grad, - DenseTensor* y_grad); - -} // namespace phi diff --git a/paddle/phi/kernels/impl/addmm_kernel_impl.h b/paddle/phi/kernels/impl/addmm_kernel_impl.h index 41f3f4b39c98a..151e8f7420acc 100644 --- a/paddle/phi/kernels/impl/addmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_kernel_impl.h @@ -37,8 +37,8 @@ void AddmmKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& x, const DenseTensor& y, - float alpha, float beta, + float alpha, DenseTensor* out) { auto input_dims = input.dims(); auto x_dims = x.dims(); diff --git a/paddle/phi/kernels/impl/crop_tensor_grad_kernel_impl.h b/paddle/phi/kernels/impl/crop_grad_kernel_impl.h similarity index 91% rename from paddle/phi/kernels/impl/crop_tensor_grad_kernel_impl.h rename to paddle/phi/kernels/impl/crop_grad_kernel_impl.h index 0d3e579fe8bc8..583d495a5da2f 100644 --- a/paddle/phi/kernels/impl/crop_tensor_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/crop_grad_kernel_impl.h @@ -15,7 +15,7 @@ #pragma once -#include "paddle/phi/kernels/crop_tensor_grad_kernel.h" +#include "paddle/phi/kernels/crop_grad_kernel.h" #include @@ -52,11 +52,11 @@ void CropTensorGradFunction(const Context& dev_ctx, } template -void CropTensorGradKernel(const Context& dev_ctx, - const DenseTensor& out_grad, - const DenseTensor& x, - const IntArray& offsets, - DenseTensor* x_grad) { +void CropGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const IntArray& offsets, + DenseTensor* x_grad) { size_t rank = out_grad.dims().size(); PADDLE_ENFORCE_GE( rank, diff --git a/paddle/phi/kernels/impl/crop_tensor_kernel_impl.h b/paddle/phi/kernels/impl/crop_kernel_impl.h similarity index 95% rename from paddle/phi/kernels/impl/crop_tensor_kernel_impl.h rename to paddle/phi/kernels/impl/crop_kernel_impl.h index e6d7f8f672659..d3cb672104d67 100644 --- a/paddle/phi/kernels/impl/crop_tensor_kernel_impl.h +++ b/paddle/phi/kernels/impl/crop_kernel_impl.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/kernels/crop_tensor_kernel.h" +#include "paddle/phi/kernels/crop_kernel.h" #include #include @@ -127,11 +127,11 @@ void CropTensorFunction(const Context& dev_ctx, } template -void CropTensorKernel(const Context& dev_ctx, - const DenseTensor& x, - const IntArray& shape, - const IntArray& offsets, - DenseTensor* out) { +void CropKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& shape, + const IntArray& offsets, + DenseTensor* out) { int rank = x.dims().size(); PADDLE_ENFORCE_GE( rank, diff --git a/paddle/phi/kernels/onednn/gaussian_random_kernel.cc b/paddle/phi/kernels/onednn/gaussian_kernel.cc similarity index 76% rename from paddle/phi/kernels/onednn/gaussian_random_kernel.cc rename to paddle/phi/kernels/onednn/gaussian_kernel.cc index 77331e5c32ea1..bd293d553930c 100644 --- a/paddle/phi/kernels/onednn/gaussian_random_kernel.cc +++ b/paddle/phi/kernels/onednn/gaussian_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/gaussian_random_kernel.h" +#include "paddle/phi/kernels/gaussian_kernel.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/phi/core/kernel_registry.h" @@ -20,13 +20,13 @@ namespace phi { template -void GaussianRandomKernel(const Context& ctx, - const IntArray& shape, - float mean, - float std, - int seed, - DataType dtype, - DenseTensor* out) { +void GaussianKernel(const Context& ctx, + const IntArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out) { std::normal_distribution dist(mean, std); std::shared_ptr engine; if (seed) { @@ -51,5 +51,4 @@ void GaussianRandomKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL( - gaussian_random, OneDNN, ONEDNN, phi::GaussianRandomKernel, float) {} +PD_REGISTER_KERNEL(gaussian, OneDNN, ONEDNN, phi::GaussianKernel, float) {} diff --git a/paddle/phi/kernels/graph_send_recv_kernel.h b/paddle/phi/kernels/send_u_recv_grad_kernel.h similarity index 75% rename from paddle/phi/kernels/graph_send_recv_kernel.h rename to paddle/phi/kernels/send_u_recv_grad_kernel.h index 023e86064ff51..1acb3bd7f14c4 100644 --- a/paddle/phi/kernels/graph_send_recv_kernel.h +++ b/paddle/phi/kernels/send_u_recv_grad_kernel.h @@ -16,19 +16,19 @@ #include -#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" namespace phi { template -void GraphSendRecvKernel(const Context& ctx, +void SendURecvGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, + const paddle::optional& out, + const paddle::optional& dst_count, + const DenseTensor& out_grad, const std::string& reduce_op, - const IntArray& out_size, - DenseTensor* out, - DenseTensor* dst_count); - + DenseTensor* x_grad); } // namespace phi diff --git a/paddle/phi/kernels/send_u_recv_kernel.h b/paddle/phi/kernels/send_u_recv_kernel.h new file mode 100644 index 0000000000000..197d8452b4779 --- /dev/null +++ b/paddle/phi/kernels/send_u_recv_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SendURecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& reduce_op, + const IntArray& out_size, + DenseTensor* out, + DenseTensor* dst_count); + +} // namespace phi diff --git a/paddle/phi/kernels/graph_send_recv_grad_kernel.h b/paddle/phi/kernels/send_ue_recv_grad_kernel.h similarity index 55% rename from paddle/phi/kernels/graph_send_recv_grad_kernel.h rename to paddle/phi/kernels/send_ue_recv_grad_kernel.h index 1b618c6fede21..798626d265e02 100644 --- a/paddle/phi/kernels/graph_send_recv_grad_kernel.h +++ b/paddle/phi/kernels/send_ue_recv_grad_kernel.h @@ -15,20 +15,22 @@ #pragma once #include - #include "paddle/phi/core/dense_tensor.h" #include "paddle/utils/optional.h" namespace phi { template -void GraphSendRecvGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const paddle::optional& out, - const paddle::optional& dst_count, - const DenseTensor& out_grad, - const std::string& reduce_op, - DenseTensor* x_grad); +void SendUERecvGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const paddle::optional& out, + const paddle::optional& dst_count, + const DenseTensor& out_grad, + const std::string& message_op, + const std::string& reduce_op, + DenseTensor* x_grad, + DenseTensor* y_grad); } // namespace phi diff --git a/paddle/phi/kernels/send_ue_recv_kernel.h b/paddle/phi/kernels/send_ue_recv_kernel.h new file mode 100644 index 0000000000000..6353572bfaa28 --- /dev/null +++ b/paddle/phi/kernels/send_ue_recv_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SendUERecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& message_op, + const std::string& reduce_op, + const IntArray& out_size, + DenseTensor* out, + DenseTensor* dst_count); + +} // namespace phi diff --git a/paddle/phi/kernels/send_uv_grad_kernel.h b/paddle/phi/kernels/send_uv_grad_kernel.h new file mode 100644 index 0000000000000..b5a6200d1111e --- /dev/null +++ b/paddle/phi/kernels/send_uv_grad_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SendUVGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const DenseTensor& out_grad, + const std::string& message_op, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/graph_send_uv_kernel.h b/paddle/phi/kernels/send_uv_kernel.h similarity index 69% rename from paddle/phi/kernels/graph_send_uv_kernel.h rename to paddle/phi/kernels/send_uv_kernel.h index 7b723122c1a7f..f91342772955c 100644 --- a/paddle/phi/kernels/graph_send_uv_kernel.h +++ b/paddle/phi/kernels/send_uv_kernel.h @@ -20,12 +20,12 @@ namespace phi { template -void GraphSendUVKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& message_op, - DenseTensor* out); +void SendUVKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& message_op, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/sparse/addmm_kernel.h b/paddle/phi/kernels/sparse/addmm_kernel.h index 3cf21fbca2f81..00bf904c2855f 100644 --- a/paddle/phi/kernels/sparse/addmm_kernel.h +++ b/paddle/phi/kernels/sparse/addmm_kernel.h @@ -27,8 +27,8 @@ void AddmmCooCooKernel(const Context& dev_ctx, const SparseCooTensor& input, const SparseCooTensor& x, const SparseCooTensor& y, - float alpha, float beta, + float alpha, SparseCooTensor* out); /* DENSE + COO @ DENSE -> DENSE */ @@ -37,8 +37,8 @@ void AddmmCooDenseKernel(const Context& dev_ctx, const DenseTensor& input, const SparseCooTensor& x, const DenseTensor& y, - float alpha, float beta, + float alpha, DenseTensor* out); // TODO(zhouwei25): implement " CSR + CSR @ CSR -> CSR" @@ -47,8 +47,8 @@ void AddmmCsrCsrKernel(const Context& dev_ctx, const SparseCsrTensor& input, const SparseCsrTensor& x, const SparseCsrTensor& y, - float alpha, float beta, + float alpha, SparseCsrTensor* out); /* DENSE + CSR @ DENSE -> DENSE */ @@ -57,8 +57,8 @@ void AddmmCsrDenseKernel(const Context& dev_ctx, const DenseTensor& input, const SparseCsrTensor& x, const DenseTensor& y, - float alpha, float beta, + float alpha, DenseTensor* out); } // namespace sparse diff --git a/paddle/phi/kernels/sparse/cpu/addmm_kernel.cc b/paddle/phi/kernels/sparse/cpu/addmm_kernel.cc index e58d9d0e69196..fd702ef88a542 100644 --- a/paddle/phi/kernels/sparse/cpu/addmm_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/addmm_kernel.cc @@ -25,8 +25,8 @@ void AddmmCooDenseKernel(const Context& dev_ctx, const DenseTensor& input, const SparseCooTensor& x, const DenseTensor& y, - float alpha, float beta, + float alpha, DenseTensor* out) { PADDLE_THROW(phi::errors::Unimplemented( "Not support CPU kernel of 'sparse.addmm' now.")); @@ -38,8 +38,8 @@ void AddmmCsrDenseKernel(const Context& dev_ctx, const DenseTensor& input, const SparseCsrTensor& x, const DenseTensor& y, - float alpha, 
float beta, + float alpha, DenseTensor* out) { PADDLE_THROW(phi::errors::Unimplemented( "Not support CPU kernel of 'sparse.addmm' now.")); diff --git a/paddle/phi/kernels/sparse/gpu/addmm_kernel.cu b/paddle/phi/kernels/sparse/gpu/addmm_kernel.cu index 3e5d423b9f96a..1a43009c519b6 100644 --- a/paddle/phi/kernels/sparse/gpu/addmm_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/addmm_kernel.cu @@ -31,8 +31,8 @@ void AddmmKernelImpl(const Context& dev_ctx, const DenseTensor& input, const TensorType& x, const DenseTensor& y, - float alpha, float beta, + float alpha, DenseTensor* out) { #if CUDA_VERSION >= 11000 std::vector input_dim = phi::vectorize(input.dims()); @@ -107,10 +107,10 @@ void AddmmCooDenseKernel(const Context& dev_ctx, const DenseTensor& input, const SparseCooTensor& x, const DenseTensor& y, - float alpha, float beta, + float alpha, DenseTensor* out) { - AddmmKernelImpl(dev_ctx, input, x, y, alpha, beta, out); + AddmmKernelImpl(dev_ctx, input, x, y, beta, alpha, out); } template @@ -118,10 +118,10 @@ void AddmmCsrDenseKernel(const Context& dev_ctx, const DenseTensor& input, const SparseCsrTensor& x, const DenseTensor& y, - float alpha, float beta, + float alpha, DenseTensor* out) { - AddmmKernelImpl(dev_ctx, input, x, y, alpha, beta, out); + AddmmKernelImpl(dev_ctx, input, x, y, beta, alpha, out); } } // namespace sparse diff --git a/paddle/phi/kernels/xpu/gaussian_random_kernel.cc b/paddle/phi/kernels/xpu/gaussian_kernel.cc similarity index 78% rename from paddle/phi/kernels/xpu/gaussian_random_kernel.cc rename to paddle/phi/kernels/xpu/gaussian_kernel.cc index 913ad2472e9a8..a52a0d429ff80 100644 --- a/paddle/phi/kernels/xpu/gaussian_random_kernel.cc +++ b/paddle/phi/kernels/xpu/gaussian_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
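// [Editor's note] The addmm changes above only reorder the (alpha, beta)
// attributes to (beta, alpha); the computed formula is unchanged:
//   out = beta * input + alpha * (x matmul y).
// A dense row-major reference making that explicit (illustrative only):
#include <cstddef>
#include <vector>
std::vector<float> addmm_ref(const std::vector<float>& input,  // m x n
                             const std::vector<float>& x,      // m x k
                             const std::vector<float>& y,      // k x n
                             int m, int k, int n,
                             float beta, float alpha) {
  std::vector<float> out(static_cast<std::size_t>(m) * n);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      float acc = 0.0f;
      for (int p = 0; p < k; ++p) acc += x[i * k + p] * y[p * n + j];
      out[i * n + j] = beta * input[i * n + j] + alpha * acc;
    }
  return out;
}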
-#include "paddle/phi/kernels/gaussian_random_kernel.h" +#include "paddle/phi/kernels/gaussian_kernel.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/memory/memcpy.h" @@ -22,13 +22,13 @@ namespace phi { template -void GaussianRandomKernel(const Context& ctx, - const IntArray& shape, - float mean, - float std, - int seed, - DataType dtype, - DenseTensor* out) { +void GaussianKernel(const Context& ctx, + const IntArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out) { std::normal_distribution dist(mean, std); int64_t size = out->numel(); ctx.template Alloc(out); @@ -51,5 +51,4 @@ void GaussianRandomKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL( - gaussian_random, XPU, ALL_LAYOUT, phi::GaussianRandomKernel, float) {} +PD_REGISTER_KERNEL(gaussian, XPU, ALL_LAYOUT, phi::GaussianKernel, float) {} diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index e70e5f72d44c3..990790e4798ed 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -41,7 +41,7 @@ namespace phi { DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Square, "square", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardTanh, "hard_tanh", "t_min" comma "t_max"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LeakyRelu, "leaky_relu", "alpha"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu, "thresholded_relu", @@ -228,6 +228,8 @@ PD_REGISTER_BASE_KERNEL_NAME(sqrt_grad_grad, sqrt_double_grad); PD_REGISTER_BASE_KERNEL_NAME(rsqrt_grad_grad, rsqrt_double_grad); PD_REGISTER_BASE_KERNEL_NAME(celu_grad_grad, celu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(square_grad_grad, square_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(brelu, hard_tanh); +PD_REGISTER_BASE_KERNEL_NAME(brelu_grad, hard_tanh_grad); PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping); @@ -252,7 +254,7 @@ PD_REGISTER_ARG_MAPPING_FN(tanh_grad_grad, phi::TanhDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tanh_triple_grad, phi::TanhTripleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::BReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::HardTanhGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(leaky_relu, phi::LeakyReluOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad, phi::LeakyReluGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/addmm_sig.cc b/paddle/phi/ops/compat/addmm_sig.cc index 3919c875f5606..23e3e4f2689e6 100644 --- a/paddle/phi/ops/compat/addmm_sig.cc +++ b/paddle/phi/ops/compat/addmm_sig.cc @@ -16,6 +16,11 @@ namespace phi { +KernelSignature AddmmOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "addmm", {"Input", "X", "Y"}, {"Beta", "Alpha"}, {"Out"}); +} + KernelSignature AddmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("addmm_grad", {"Input", "X", "Y", "Out@GRAD"}, @@ -25,4 +30,5 @@ KernelSignature AddmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi +PD_REGISTER_ARG_MAPPING_FN(addmm, phi::AddmmOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(addmm_grad, phi::AddmmGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/crop_tensor_sig.cc b/paddle/phi/ops/compat/crop_tensor_sig.cc index 994a7de8fb403..8cf4ddab336bb 100644 --- a/paddle/phi/ops/compat/crop_tensor_sig.cc +++ b/paddle/phi/ops/compat/crop_tensor_sig.cc @@ -20,35 +20,31 @@ KernelSignature 
CropTensorOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.InputSize("ShapeTensor") > 0) { if (ctx.InputSize("OffsetsTensor") > 0) { return KernelSignature( - "crop_tensor", {"X"}, {"ShapeTensor", "OffsetsTensor"}, {"Out"}); + "crop", {"X"}, {"ShapeTensor", "OffsetsTensor"}, {"Out"}); } else if (ctx.HasInput("Offsets")) { return KernelSignature( - "crop_tensor", {"X"}, {"ShapeTensor", "Offsets"}, {"Out"}); + "crop", {"X"}, {"ShapeTensor", "Offsets"}, {"Out"}); } else { return KernelSignature( - "crop_tensor", {"X"}, {"ShapeTensor", "offsets"}, {"Out"}); + "crop", {"X"}, {"ShapeTensor", "offsets"}, {"Out"}); } } else if (ctx.HasInput("Shape")) { if (ctx.InputSize("OffsetsTensor") > 0) { return KernelSignature( - "crop_tensor", {"X"}, {"Shape", "OffsetsTensor"}, {"Out"}); + "crop", {"X"}, {"Shape", "OffsetsTensor"}, {"Out"}); } else if (ctx.HasInput("Offsets")) { - return KernelSignature( - "crop_tensor", {"X"}, {"Shape", "Offsets"}, {"Out"}); + return KernelSignature("crop", {"X"}, {"Shape", "Offsets"}, {"Out"}); } else { - return KernelSignature( - "crop_tensor", {"X"}, {"Shape", "offsets"}, {"Out"}); + return KernelSignature("crop", {"X"}, {"Shape", "offsets"}, {"Out"}); } } else { if (ctx.InputSize("OffsetsTensor") > 0) { return KernelSignature( - "crop_tensor", {"X"}, {"shape", "OffsetsTensor"}, {"Out"}); + "crop", {"X"}, {"shape", "OffsetsTensor"}, {"Out"}); } else if (ctx.HasInput("Offsets")) { - return KernelSignature( - "crop_tensor", {"X"}, {"shape", "Offsets"}, {"Out"}); + return KernelSignature("crop", {"X"}, {"shape", "Offsets"}, {"Out"}); } else { - return KernelSignature( - "crop_tensor", {"X"}, {"shape", "offsets"}, {"Out"}); + return KernelSignature("crop", {"X"}, {"shape", "offsets"}, {"Out"}); } } } @@ -57,18 +53,21 @@ KernelSignature CropTensorGradOpArgumentMapping( const ArgumentMappingContext& ctx) { if (ctx.InputSize("OffsetsTensor") > 0) { return KernelSignature( - "crop_tensor_grad", {"X", "Out@GRAD"}, {"OffsetsTensor"}, {"X@GRAD"}); + "crop_grad", {"X", "Out@GRAD"}, {"OffsetsTensor"}, {"X@GRAD"}); } else if (ctx.HasInput("Offsets")) { return KernelSignature( - "crop_tensor_grad", {"X", "Out@GRAD"}, {"Offsets"}, {"X@GRAD"}); + "crop_grad", {"X", "Out@GRAD"}, {"Offsets"}, {"X@GRAD"}); } else { return KernelSignature( - "crop_tensor_grad", {"X", "Out@GRAD"}, {"offsets"}, {"X@GRAD"}); + "crop_grad", {"X", "Out@GRAD"}, {"offsets"}, {"X@GRAD"}); } } } // namespace phi +PD_REGISTER_BASE_KERNEL_NAME(crop_tensor, crop); +PD_REGISTER_BASE_KERNEL_NAME(crop_tensor_grad, crop_grad); + PD_REGISTER_ARG_MAPPING_FN(crop_tensor, phi::CropTensorOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(crop_tensor_grad, phi::CropTensorGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/gaussian_random_sig.cc b/paddle/phi/ops/compat/gaussian_random_sig.cc index 2f2b157e4c0f9..ac0caf0e16dcc 100644 --- a/paddle/phi/ops/compat/gaussian_random_sig.cc +++ b/paddle/phi/ops/compat/gaussian_random_sig.cc @@ -22,13 +22,11 @@ KernelSignature GaussianRandomOpArgumentMapping( if (ctx.InputSize("ShapeTensorList") > 0) { // Infer output shape by Attr("shape") in CompileTime if it is specified. 
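// [Editor's note] The PD_REGISTER_BASE_KERNEL_NAME(old, new) lines added in
// these compat files are what keep the legacy fluid op names (brelu,
// crop_tensor, gaussian_random, ...) resolving to the renamed phi kernels.
// Conceptually it is a name-alias table consulted before kernel lookup; a toy
// model of that mechanism, not phi's actual registry code:
#include <string>
#include <unordered_map>
std::unordered_map<std::string, std::string>& AliasTable() {
  static std::unordered_map<std::string, std::string> table;
  return table;  // roughly what each PD_REGISTER_BASE_KERNEL_NAME populates
}
std::string ResolveKernelName(const std::string& op_name) {
  auto it = AliasTable().find(op_name);
  return it == AliasTable().end() ? op_name : it->second;
}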
if (!ctx.IsRuntime() && !shape.empty()) { - return KernelSignature("gaussian_random", - {}, - {"shape", "mean", "std", "seed", "dtype"}, - {"Out"}); + return KernelSignature( + "gaussian", {}, {"shape", "mean", "std", "seed", "dtype"}, {"Out"}); } else { return KernelSignature( - "gaussian_random", + "gaussian", {}, {"ShapeTensorList", "mean", "std", "seed", "dtype"}, {"Out"}); @@ -36,19 +34,19 @@ KernelSignature GaussianRandomOpArgumentMapping( } if (ctx.HasInput("ShapeTensor") && shape.empty()) { - return KernelSignature("gaussian_random", + return KernelSignature("gaussian", {}, {"ShapeTensor", "mean", "std", "seed", "dtype"}, {"Out"}); } - return KernelSignature("gaussian_random", - {}, - {"shape", "mean", "std", "seed", "dtype"}, - {"Out"}); + return KernelSignature( + "gaussian", {}, {"shape", "mean", "std", "seed", "dtype"}, {"Out"}); } } // namespace phi +PD_REGISTER_BASE_KERNEL_NAME(gaussian_random, gaussian); + PD_REGISTER_ARG_MAPPING_FN(gaussian_random, phi::GaussianRandomOpArgumentMapping); diff --git a/paddle/phi/ops/compat/graph_send_recv_sig.cc b/paddle/phi/ops/compat/graph_send_recv_sig.cc index 0ca1a3fae0230..ef8eeae358e09 100644 --- a/paddle/phi/ops/compat/graph_send_recv_sig.cc +++ b/paddle/phi/ops/compat/graph_send_recv_sig.cc @@ -19,12 +19,12 @@ namespace phi { KernelSignature GraphSendRecvOpArgumentMapping( const ArgumentMappingContext& ctx) { if (ctx.HasInput("Out_size")) { - return KernelSignature("graph_send_recv", + return KernelSignature("send_u_recv", {"X", "Src_index", "Dst_index"}, {"reduce_op", "Out_size"}, {"Out", "Dst_count"}); } else { - return KernelSignature("graph_send_recv", + return KernelSignature("send_u_recv", {"X", "Src_index", "Dst_index"}, {"reduce_op", "out_size"}, {"Out", "Dst_count"}); @@ -34,7 +34,7 @@ KernelSignature GraphSendRecvOpArgumentMapping( KernelSignature GraphSendRecvGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "graph_send_recv_grad", + "send_u_recv_grad", {"X", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, {"reduce_op"}, {"X@GRAD"}); @@ -42,6 +42,9 @@ KernelSignature GraphSendRecvGradOpArgumentMapping( } // namespace phi +PD_REGISTER_BASE_KERNEL_NAME(graph_send_recv, send_u_recv); +PD_REGISTER_BASE_KERNEL_NAME(graph_send_recv_grad, send_u_recv_grad); + PD_REGISTER_ARG_MAPPING_FN(graph_send_recv, phi::GraphSendRecvOpArgumentMapping); diff --git a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc index 0b2ddcc07e1bb..aab850831ae33 100644 --- a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc +++ b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc @@ -19,12 +19,12 @@ namespace phi { KernelSignature GraphSendUERecvOpArgumentMapping( const ArgumentMappingContext& ctx) { if (ctx.HasInput("Out_size")) { - return KernelSignature("graph_send_ue_recv", + return KernelSignature("send_ue_recv", {"X", "Y", "Src_index", "Dst_index"}, {"message_op", "reduce_op", "Out_size"}, {"Out", "Dst_count"}); } else { - return KernelSignature("graph_send_ue_recv", + return KernelSignature("send_ue_recv", {"X", "Y", "Src_index", "Dst_index"}, {"message_op", "reduce_op", "out_size"}, {"Out", "Dst_count"}); @@ -34,7 +34,7 @@ KernelSignature GraphSendUERecvOpArgumentMapping( KernelSignature GraphSendUERecvGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "graph_send_ue_recv_grad", + "send_ue_recv_grad", {"X", "Y", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, {"message_op", "reduce_op"}, {"X@GRAD", "Y@GRAD"}); @@ 
-42,6 +42,9 @@ KernelSignature GraphSendUERecvGradOpArgumentMapping( } // namespace phi +PD_REGISTER_BASE_KERNEL_NAME(graph_send_ue_recv, send_ue_recv); +PD_REGISTER_BASE_KERNEL_NAME(graph_send_ue_recv_grad, send_ue_recv_grad); + PD_REGISTER_ARG_MAPPING_FN(graph_send_ue_recv, phi::GraphSendUERecvOpArgumentMapping); diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 3e6c9ef3674ac..5cf54f221362b 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -435,7 +435,7 @@ def forward(self, var, block=None): if in_dygraph_mode(): place = _current_expected_place() - out_var = _C_ops.gaussian_random( + out_var = _C_ops.gaussian( var.shape, self._mean, self._std_dev, @@ -737,7 +737,7 @@ def forward(self, var, block=None): if in_dygraph_mode(): place = _current_expected_place() - out_var = _C_ops.gaussian_random( + out_var = _C_ops.gaussian( out_var.shape, 0.0, std, self._seed, out_dtype, place ) else: @@ -949,7 +949,7 @@ def forward(self, var, block=None): std = gain / math.sqrt(float(fan_in)) if in_dygraph_mode(): place = _current_expected_place() - out_var = _C_ops.gaussian_random( + out_var = _C_ops.gaussian( out_var.shape, 0.0, std, self._seed, out_dtype, place ) else: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 525558cb77b79..17bd7ff86dbf9 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -11837,7 +11837,7 @@ def gaussian_random( if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) place = _current_expected_place() - return _C_ops.gaussian_random( + return _C_ops.gaussian( shape, float(mean), float(std), seed, dtype, place ) diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py index b6131d86edd52..047a2e45c4b98 100644 --- a/python/paddle/geometric/message_passing/send_recv.py +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -133,7 +133,7 @@ def send_u_recv( return out if in_dygraph_mode(): out_size = convert_out_size_to_list(out_size) - return _C_ops.graph_send_recv( + return _C_ops.send_u_recv( x, src_index, dst_index, reduce_op.upper(), out_size ) @@ -320,7 +320,7 @@ def send_ue_recv( return out if in_dygraph_mode(): out_size = convert_out_size_to_list(out_size) - return _C_ops.graph_send_ue_recv( + return _C_ops.send_ue_recv( x, y, src_index, @@ -464,16 +464,14 @@ def send_uv(x, y, src_index, dst_index, message_op="add", name=None): y = 1.0 / (y + 1e-12) if in_dygraph_mode(): - return _C_ops.graph_send_uv( - x, y, src_index, dst_index, message_op.upper() - ) + return _C_ops.send_uv(x, y, src_index, dst_index, message_op.upper()) else: if _in_legacy_dygraph(): return _legacy_C_ops.graph_send_uv( x, y, src_index, dst_index, "message_op", message_op.upper() ) else: - helper = LayerHelper("send_uv", **locals()) + helper = LayerHelper("graph_send_uv", **locals()) check_variable_and_dtype( x, 'x', diff --git a/python/paddle/incubate/operators/graph_send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py index 88fc421f4a09a..73edc03cf4414 100644 --- a/python/paddle/incubate/operators/graph_send_recv.py +++ b/python/paddle/incubate/operators/graph_send_recv.py @@ -139,7 +139,7 @@ def graph_send_recv( return out if in_dygraph_mode(): out_size = convert_out_size_to_list(out_size) - return _C_ops.graph_send_recv( + return _C_ops.send_u_recv( x, src_index, dst_index, pool_type.upper(), out_size ) diff --git 
a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index 254ab7ff8a005..0f0682cde61ac 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -288,7 +288,7 @@ def hardtanh(x, min=-1.0, max=1.0, name=None):
     """
     if in_dygraph_mode():
-        return _C_ops.brelu(x, min, max)
+        return _C_ops.hardtanh(x, min, max)
     if _in_legacy_dygraph():
         return _legacy_C_ops.brelu(x, 't_min', min, 't_max', max)
diff --git a/python/paddle/nn/initializer/orthogonal.py b/python/paddle/nn/initializer/orthogonal.py
index 84fabba483deb..63c65211c631b 100644
--- a/python/paddle/nn/initializer/orthogonal.py
+++ b/python/paddle/nn/initializer/orthogonal.py
@@ -106,7 +106,7 @@ def __call__(self, var, block=None):
         if framework.in_dygraph_mode():
             with no_grad():
                 place = framework._current_expected_place()
-                normal_var = _C_ops.gaussian_random(
+                normal_var = _C_ops.gaussian(
                     flatten_shape, 0.0, 1.0, self._seed, var.dtype, place
                 )
                 q, r = _C_ops.qr(normal_var, 'reduced')
diff --git a/python/paddle/sparse/multiary.py b/python/paddle/sparse/multiary.py
index 32f12b97c2209..a09611d2d0f51 100644
--- a/python/paddle/sparse/multiary.py
+++ b/python/paddle/sparse/multiary.py
@@ -79,4 +79,4 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):
             out = paddle.sparse.addmm(input, x, y, 3.0, 2.0)
     """
-    return _C_ops.sparse_addmm(input, x, y, alpha, beta)
+    return _C_ops.sparse_addmm(input, x, y, beta, alpha)
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index ed0c196efff23..7c629a556b097 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -716,7 +716,7 @@ def crop(x, shape=None, offsets=None, name=None):
         shape = x.shape
     if in_dygraph_mode():
-        return _C_ops.crop_tensor(x, shape, offsets)
+        return _C_ops.crop(x, shape, offsets)
     out = helper.create_variable_for_type_inference(x.dtype)
     ipts = {'X': x}
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 34bc3b006b3d9..f367215cd1d3d 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -1941,7 +1941,7 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):
         )
     if in_dygraph_mode():
-        return _C_ops.addmm(input, x, y, alpha, beta)
+        return _C_ops.addmm(input, x, y, beta, alpha)
     else:
         if _in_legacy_dygraph():
             out = _legacy_C_ops.addmm(input, x, y, "Alpha", alpha, "Beta", beta)
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index 2449f9c3194e7..8791ebb7af268 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -257,7 +257,7 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None):
     if in_dygraph_mode():
         shape = utils.convert_shape_to_list(shape)
         place = _current_expected_place()
-        return _C_ops.gaussian_random(
+        return _C_ops.gaussian(
             shape, float(mean), float(std), seed, dtype, place
         )

From 5158fa4f3736be7b22dbcabfa09d5244478d25a1 Mon Sep 17 00:00:00 2001
From: umiswing
Date: Tue, 1 Nov 2022 00:03:52 +0800
Subject: [PATCH 46/91] summer-ospp 2022: PaddlePaddle Sparse Conv development
 and optimization: gather-gemm-scatter fuse (#46679)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cmake/external/cutlass.cmake                  |  43 ++
 cmake/third_party.cmake                       |  10 +
 paddle/phi/kernels/sparse/gpu/conv_kernel.cu  | 201 +++++--
 .../kernels/sparse/gpu/gather_gemm_scatter.cu | 188 ++++++
 .../kernels/sparse/gpu/gather_gemm_scatter.h  | 555 ++++++++++++++++++
 5 files changed, 941 insertions(+), 56 deletions(-)
 create mode 100644 cmake/external/cutlass.cmake
 create mode 100644 paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.cu
 create mode 100644 paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h

diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake
new file mode 100644
index 0000000000000..a80a729a13957
--- /dev/null
+++ b/cmake/external/cutlass.cmake
@@ -0,0 +1,43 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+set(CUTLASS_PREFIX_DIR ${THIRD_PARTY_PATH}/cutlass)
+
+set(CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git)
+set(CUTLASS_TAG v2.9.1)
+
+include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/")
+include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/include/")
+include_directories(
+  "${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/tools/util/include/")
+
+add_definitions("-DPADDLE_WITH_CUTLASS")
+
+ExternalProject_Add(
+  extern_cutlass
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${CUTLASS_REPOSITORY}
+  GIT_TAG "${CUTLASS_TAG}"
+  PREFIX ${CUTLASS_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND "")
+
+add_library(cutlass INTERFACE)
+
+add_dependencies(cutlass extern_cutlass)
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 06ca0d16df033..4475f5b14d28e 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -505,4 +505,14 @@ if(WITH_CUSPARSELT)
   list(APPEND third_party_deps extern_cusparselt)
 endif()

+if(WITH_GPU
+   AND NOT WITH_ARM
+   AND NOT WIN32
+   AND NOT APPLE)
+  if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
+    include(external/cutlass) # download, build, install cutlass
+    list(APPEND third_party_deps extern_cutlass)
+  endif()
+endif()
+
 add_custom_target(third_party ALL DEPENDS ${third_party_deps})
diff --git a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu
index 282033e62e357..e5e3cd0f5c184 100644
--- a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu
@@ -22,6 +22,9 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/scatter.cu.h"
 #include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
 #include "paddle/phi/kernels/sparse/gpu/conv.cu.h"
+#ifdef PADDLE_WITH_CUTLASS
+#include "paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h"
+#endif

 #include "glog/logging.h"

@@ -120,29 +123,6 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
         dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
   }

-  // 2.
gather - phi::DenseTensor in_features = - phi::Empty(dev_ctx, {rulebook_len, in_channels}); - phi::DenseTensor out_features = - phi::Empty(dev_ctx, {rulebook_len, out_channels}); - T* in_features_ptr = in_features.data(); - T* out_features_ptr = out_features.data(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, &out_features, static_cast(0.0f)); - - Gather(dev_ctx, - x.values().data(), - rulebook_ptr, - rulebook_len, - in_channels, - in_features_ptr); - - // 3. call gemm for every werght - auto blas = phi::funcs::GetBlas(dev_ctx); - auto* out_values = out->mutable_values(); - T* out_values_ptr = out_values->data(); - set_zero(dev_ctx, out_values, static_cast(0.0f)); - if (subm) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); @@ -162,43 +142,152 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, out_index_ptr, unique_value_ptr); } +#ifdef PADDLE_WITH_CUTLASS + bool cutlass = true; + if (dev_ctx.GetComputeCapability() < 80) cutlass = false; + if (in_channels % 4 != 0 || out_channels % 4 != 0) { + if (std::is_same::value) cutlass = false; + if (std::is_same::value) cutlass = false; + } + if (!std::is_same::value) cutlass = false; + if (cutlass) { + auto* out_values = out->mutable_non_zero_elements(); + T* out_values_ptr = out_values->data(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, out_values, static_cast(0.0f)); + + const T* kernel_ptr = kernel.data(); + for (int i = 0; i < kernel_size; i++) { + if (h_counter_ptr[i] <= 0) { + continue; + } - const T* kernel_ptr = kernel.data(); - for (int i = 0; i < kernel_size; i++) { - if (h_counter_ptr[i] <= 0) { - continue; + const int M = h_counter_ptr[i]; + const int K = in_channels; + const int N = out_channels; + const T* tmp_kernel_ptr = kernel_ptr + i * K * N; + const IntT* gather_indices = rulebook_ptr + h_offsets_ptr[i]; + const IntT* scatter_indices = + rulebook_ptr + rulebook_len + h_offsets_ptr[i]; + + if constexpr (std::is_same::value && + std::is_same::value) { + fp16_gather_gemm_scatter gather_gemm_scatter = + getBestFp16Kernel(M, N, K); + gather_gemm_scatter( + dev_ctx, + reinterpret_cast( + x.non_zero_elements().data()), + reinterpret_cast(tmp_kernel_ptr), + reinterpret_cast(out_values_ptr), + reinterpret_cast(out_values_ptr), + M, + N, + K, + static_cast(gather_indices), + static_cast(scatter_indices), + static_cast(1), + static_cast(1)); + } + if constexpr (std::is_same::value && + std::is_same::value) { + fp32_gather_gemm_scatter gather_gemm_scatter = + getBestFp32Kernel(M, N, K); + gather_gemm_scatter(dev_ctx, + x.non_zero_elements().data(), + tmp_kernel_ptr, + out_values_ptr, + out_values_ptr, + M, + N, + K, + gather_indices, + scatter_indices, + static_cast(1), + static_cast(1)); + } + if constexpr (std::is_same::value && + std::is_same::value) { + fp64_gather_gemm_scatter gather_gemm_scatter = + getBestFp64Kernel(M, N, K); + gather_gemm_scatter(dev_ctx, + x.non_zero_elements().data(), + tmp_kernel_ptr, + out_values_ptr, + out_values_ptr, + M, + N, + K, + gather_indices, + scatter_indices, + static_cast(1), + static_cast(1)); + } } + } else { +#endif + // 2. 
gather
+    phi::DenseTensor in_features =
+        phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
+    phi::DenseTensor out_features =
+        phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
+    T* in_features_ptr = in_features.data<T>();
+    T* out_features_ptr = out_features.data<T>();
+    phi::funcs::SetConstant<GPUContext, T> set_zero;
+    set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));

-    // call gemm: (n, in_channels) * (in_channels, out_channels)
-    const int M = h_counter_ptr[i];
-    const int K = in_channels;
-    const int N = out_channels;
-    T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
-    const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
-    T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;
-
-    blas.GEMM(CblasNoTrans,
-              CblasNoTrans,
-              M,
-              N,
-              K,
-              static_cast<T>(1),
-              tmp_in_ptr,
-              tmp_kernel_ptr,
-              static_cast<T>(0),
-              tmp_out_ptr);
-  }
+    Gather<T, IntT>(dev_ctx,
+                    x.values().data<T>(),
+                    rulebook_ptr,
+                    rulebook_len,
+                    in_channels,
+                    in_features_ptr);
+
+    // 3. call gemm for every weight
+    auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
+    auto* out_values = out->mutable_values();
+    T* out_values_ptr = out_values->data<T>();
+    set_zero(dev_ctx, out_values, static_cast<T>(0.0f));

-  // 4. scatter
-  phi::funcs::sparse::ScatterV2<T>(dev_ctx,
-                                   out_features_ptr,
-                                   out_index.data<int>(),
-                                   unique_value.data<int>(),
-                                   out->nnz(),
-                                   kernel_size,
-                                   out_channels,
-                                   1,
-                                   out_values_ptr);
+    const T* kernel_ptr = kernel.data<T>();
+    for (int i = 0; i < kernel_size; i++) {
+      if (h_counter_ptr[i] <= 0) {
+        continue;
+      }
+
+      // call gemm: (n, in_channels) * (in_channels, out_channels)
+      const int M = h_counter_ptr[i];
+      const int K = in_channels;
+      const int N = out_channels;
+      T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
+      const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
+      T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;
+
+      blas.GEMM(CblasNoTrans,
+                CblasNoTrans,
+                M,
+                N,
+                K,
+                static_cast<T>(1),
+                tmp_in_ptr,
+                tmp_kernel_ptr,
+                static_cast<T>(0),
+                tmp_out_ptr);
+    }
+
+    // 4. scatter
+    phi::funcs::sparse::ScatterV2<T>(dev_ctx,
+                                     out_features_ptr,
+                                     out_index.data<int>(),
+                                     unique_value.data<int>(),
+                                     out->nnz(),
+                                     kernel_size,
+                                     out_channels,
+                                     1,
+                                     out_values_ptr);
+#ifdef PADDLE_WITH_CUTLASS
+  }
+#endif
 }

 /**
diff --git a/paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.cu b/paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.cu
new file mode 100644
index 0000000000000..48727c8f8513d
--- /dev/null
+++ b/paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.cu
@@ -0,0 +1,188 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
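
The body of this new file follows in the next hunk: it picks one of several precompiled CUTLASS GemmUniversal configurations by problem size and hands it back as a function pointer, so the sparse conv can gather input rows, run the GEMM, and scatter output rows in one fused launch. As a minimal sketch of that dispatch pattern (thresholds and kernel names here are illustrative placeholders, not the tuned CUTLASS configurations used below):

    #include <cstdint>

    // Function-pointer type mirroring fp32_gather_gemm_scatter: a GEMM whose
    // A rows are gathered via a_indices and whose C/D rows are scattered via
    // c_d_indices inside the same kernel.
    using GatherGemmScatterFn = void (*)(const float* a, const float* b,
                                         const float* c, float* d,
                                         int m, int n, int k,
                                         const int32_t* a_indices,
                                         const int32_t* c_d_indices);

    // Hypothetical tuned instantiations; in the real file each one is a
    // launchKernel<float, SomeCutlassConfig::Gemm> template instantiation.
    void SmallTileGemm(const float*, const float*, const float*, float*,
                       int, int, int, const int32_t*, const int32_t*) {}
    void LargeTileGemm(const float*, const float*, const float*, float*,
                       int, int, int, const int32_t*, const int32_t*) {}

    // Pick a kernel by problem size: a large M amortizes big threadblock
    // tiles, while small problems keep occupancy up with smaller tiles.
    GatherGemmScatterFn PickFp32Kernel(int m, int n, int k) {
      if (k <= 32 && n <= 32) return SmallTileGemm;
      return (m >= 10000) ? LargeTileGemm : SmallTileGemm;
    }
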
+ +#ifdef PADDLE_WITH_CUTLASS +#include "paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h" +namespace phi { +namespace sparse { +fp16_gather_gemm_scatter getBestFp16Kernel(const int M, + const int N, + const int K) { + if (K == 4 && N == 16) { + return launchKernel; + } + if (K == 16 && N == 16) { + return launchKernel; + } + if (K == 16 && N == 32) { + return launchKernel; + } + if (K == 32 && N == 32) { + return launchKernel; + } + if (K == 32 && N == 64) { + return launchKernel; + } + if (K == 64 && N == 64) { + if (M > 100000) + launchKernel< + cutlass::half_t, + cutlass_tensorop_f16_s1688gemm_f16_64x128_32x2_nn_align8::Gemm>; + if (M > 20000) + launchKernel< + cutlass::half_t, + cutlass_tensorop_f16_s1688gemm_f16_64x64_32x2_nn_align8::Gemm>; + if (M > 15000) + return launchKernel< + cutlass::half_t, + cutlass_tensorop_h1688gemm_128x64_32x2_nn_align8::Gemm>; + return launchKernel; + } + if (K == 128) { + if (M >= 5000) + return launchKernel< + cutlass::half_t, + cutlass_tensorop_h1688gemm_64x64_32x2_nn_align8::Gemm>; + return launchKernel; + } + if (N == 128) { + return launchKernel; + } + return launchKernel; +} +fp32_gather_gemm_scatter getBestFp32Kernel(const int M, + const int N, + const int K) { + if (K == 4 && N == 16) { + return launchKernel< + float, + cutlass_tensorop_s1688f16gemm_64x64_16x10_nn_align4::Gemm>; + } + if (K == 16 && N == 16) { + return launchKernel< + float, + cutlass_tensorop_s1688f16gemm_64x64_16x10_nn_align4::Gemm>; + } + if (K == 16 && N == 32) { + if (M >= 10000) + return launchKernel< + float, + cutlass_tensorop_s1688gemm_64x64_16x3_nn_align4::Gemm>; + return launchKernel< + float, + cutlass_tensorop_s1688f16gemm_64x64_16x10_nn_align4::Gemm>; + } + if (K == 32 && N == 32) { + if (M >= 10000) + return launchKernel< + float, + cutlass_tensorop_s1688gemm_64x64_16x3_nn_align4::Gemm>; + return launchKernel< + float, + cutlass_tensorop_s1688f16gemm_64x64_16x10_nn_align4::Gemm>; + } + if (K == 32 && N == 64) { + if (M >= 10000) + return launchKernel< + float, + cutlass_tensorop_s1688gemm_64x64_16x3_nn_align4::Gemm>; + return launchKernel< + float, + cutlass_tensorop_s1688f16gemm_64x64_16x10_nn_align4::Gemm>; + } + if (K == 64 && N == 64) { + if (M >= 15000) + return launchKernel< + float, + cutlass_tensorop_s1688gemm_64x64_16x3_nn_align4::Gemm>; + return launchKernel< + float, + cutlass_tensorop_s1688f16gemm_64x64_16x10_nn_align4::Gemm>; + } + if (K == 128) { + if (M >= 100000) + return launchKernel< + float, + cutlass_tensorop_s1688f16gemm_128x128_16x3_nn_align4::Gemm>; + if (M >= 5000) + return launchKernel< + float, + cutlass_tensorop_s1688f16gemm_256x64_16x4_nn_align4::Gemm>; + return launchKernel< + float, + cutlass_tensorop_s1688tf32gemm_256x128_16x3_nn_align4::Gemm>; + } + if (N == 128) { + if (M >= 100000) + return launchKernel< + float, + cutlass_tensorop_s1688tf32gemm_256x128_16x3_nn_align4::Gemm>; + if (M >= 5000) + return launchKernel< + float, + cutlass_tensorop_s1688f16gemm_128x128_16x3_nn_align4::Gemm>; + return launchKernel< + float, + cutlass_tensorop_s1688f16gemm_64x128_16x6_nn_align4::Gemm>; + } + return launchKernel< + float, + cutlass_tensorop_s1688f16gemm_64x64_16x10_nn_align4::Gemm>; +} +fp64_gather_gemm_scatter getBestFp64Kernel(const int M, + const int N, + const int K) { + if (K == 4 && N == 16) { + return launchKernel; + } + if (K == 16 && N == 16) { + if (M >= 10000) + return launchKernel; + return launchKernel; + } + if (K == 16 && N == 32) { + return launchKernel; + } + if (K == 32 && N == 32) { + return launchKernel; + } + if (K 
== 32 && N == 64) { + return launchKernel; + } + if (K == 64 && N == 64) { + return launchKernel; + } + return launchKernel; +} + +} // namespace sparse +} // namespace phi +#endif diff --git a/paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h b/paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h new file mode 100644 index 0000000000000..462cd71034067 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h @@ -0,0 +1,555 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#ifdef PADDLE_WITH_CUTLASS +#include "cutlass/arch/mma.h" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/device/gemm_grouped.h" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/util/device_memory.h" +#include "examples/common/helper.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +namespace phi { +namespace sparse { +typedef void (*fp16_gather_gemm_scatter)(const GPUContext& dev_ctx, + const cutlass::half_t* const a, + const cutlass::half_t* const b, + const cutlass::half_t* const c, + cutlass::half_t* const d, + const int m, + const int n, + const int k, + const int32_t* a_indices, + const int32_t* c_d_indices, + cutlass::half_t const alpha, + cutlass::half_t const beta); +typedef void (*fp32_gather_gemm_scatter)(const GPUContext& dev_ctx, + const float* const a, + const float* const b, + const float* const c, + float* const d, + const int m, + const int n, + const int k, + const int32_t* a_indices, + const int32_t* c_d_indices, + float const alpha, + float const beta); +typedef void (*fp64_gather_gemm_scatter)(const GPUContext& dev_ctx, + const double* const a, + const double* const b, + const double* const c, + double* const d, + const int m, + const int n, + const int k, + const int32_t* a_indices, + const int32_t* c_d_indices, + double const alpha, + double const beta); +fp16_gather_gemm_scatter getBestFp16Kernel(const int M, + const int K, + const int N); +fp32_gather_gemm_scatter getBestFp32Kernel(const int M, + const int K, + const int N); +fp64_gather_gemm_scatter getBestFp64Kernel(const int M, + const int K, + const int N); +template +void launchKernel(const GPUContext& dev_ctx, + const T* const a, + const T* const b, + const T* const c, + T* const d, + const int m, + const int n, + const int k, + const int32_t* a_indices, + const int32_t* c_d_indices, + T const alpha, + T const beta) { + cutlass::gemm::GemmCoord problem_size_real({m, n, k}); + int split_k_slices = 1; + typename Gemm::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size_real, + split_k_slices, + {alpha, beta}, + a, + b, + c, + d, + cutlass::layout::RowMajor().capacity(problem_size_real.mk()), + cutlass::layout::RowMajor().capacity(problem_size_real.kn()), + cutlass::layout::RowMajor().capacity(problem_size_real.mn()), + 
cutlass::layout::RowMajor().capacity(problem_size_real.mn()), + problem_size_real.k(), + problem_size_real.n(), + problem_size_real.n(), + problem_size_real.n(), + a_indices, + nullptr, + c_d_indices}; + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + Gemm gemm_op; + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + gemm_op(dev_ctx.stream()); +} +struct cutlass_tensorop_h1688gemm_128x64_32x2_nn_align8 { + using Gemm = cutlass::gemm::device::GemmUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 2, + 8, + 8, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_h1688gemm_64x128_32x2_nn_align8 { + using Gemm = cutlass::gemm::device::GemmUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 2, + 8, + 8, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_h1688gemm_128x64_32x2_nn_align4 { + using Gemm = cutlass::gemm::device::GemmUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 2, + 4, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_h1688gemm_64x64_32x2_nn_align4 { + using Gemm = cutlass::gemm::device::GemmUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 2, + 4, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_h1688gemm_64x64_32x2_nn_align8 { + using Gemm = cutlass::gemm::device::GemmUniversal< + cutlass::half_t, + 
cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 2, + 8, + 8, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_h16816gemm_64x64_64x5_nn_align8 { + using Gemm = cutlass::gemm::device::GemmUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 5, + 8, + 8, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_f16_s1688gemm_f16_64x128_32x2_nn_align8 { + using Gemm = cutlass::gemm::device::GemmUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread:: + LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 2, + 8, + 8, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_f16_s1688gemm_f16_64x64_32x2_nn_align8 { + using Gemm = cutlass::gemm::device::GemmUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread:: + LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 2, + 8, + 8, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_s1688f16gemm_64x64_16x10_nn_align4 { + using Gemm = cutlass::gemm::device::GemmUniversal< + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 10, + 4, + 4, + cutlass::arch::OpMultiplyAddFastF16, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_s1688f16gemm_128x128_16x3_nn_align4 { + using Gemm = cutlass::gemm::device::GemmUniversal< + float, + cutlass::layout::RowMajor, 
+ float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 3, + 4, + 4, + cutlass::arch::OpMultiplyAddFastF16, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_s1688f16gemm_256x64_16x4_nn_align4 { + using Gemm = cutlass::gemm::device::GemmUniversal< + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 4, + 4, + 4, + cutlass::arch::OpMultiplyAddFastF16, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_s1688tf32gemm_256x128_16x3_nn_align4 { + using Gemm = cutlass::gemm::device::GemmUniversal< + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 3, + 4, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_s1688f16gemm_64x128_16x6_nn_align4 { + using Gemm = cutlass::gemm::device::GemmUniversal< + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 6, + 4, + 4, + cutlass::arch::OpMultiplyAddFastF16, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_s1688gemm_64x64_16x3_nn_align4 { + using Gemm = cutlass::gemm::device::GemmUniversal< + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 3, + 4, + 4, + cutlass::arch::OpMultiplyAddFastF32, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_d884gemm_16x32_16x5_nn_align1 { + using Gemm = cutlass::gemm::device::GemmUniversal< + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::RowMajor, + double, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + 
cutlass::gemm::GemmShape<16, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 5, + 1, + 1, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +struct cutlass_tensorop_d884gemm_32x16_16x5_nn_align1 { + using Gemm = cutlass::gemm::device::GemmUniversal< + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::RowMajor, + double, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 16, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, + 5, + 1, + 1, + cutlass::arch::OpMultiplyAdd, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + true, + false, + true>; +}; +} // namespace sparse +} // namespace phi +#endif From f82d7e3cb2c24be244704e7aa0f61d4afd9a7be7 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Tue, 1 Nov 2022 09:19:25 +0800 Subject: [PATCH 47/91] fix p2p comm memory release logic (#47497) --- .../collective/ProcessGroupNCCL.cc | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 76d1d42c7d653..db713ac304e29 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -453,7 +453,8 @@ std::shared_ptr ProcessGroupNCCL::PointToPoint( platform::CUDADeviceGuard cuda_guard; - if (FLAGS_use_stream_safe_cuda_allocator) { + { + platform::NCCLGroupGuard nccl_guard; for (size_t i = 0; i < tensors.size(); ++i) { cuda_guard.SetDevice(places[i]); gpuStream_t nccl_stream; @@ -465,12 +466,11 @@ std::shared_ptr ProcessGroupNCCL::PointToPoint( } else { nccl_stream = places_to_ctx_[key][i]->stream(); } - memory::RecordStream(tensors[i].Holder(), nccl_stream); + fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); } } - { - platform::NCCLGroupGuard nccl_guard; + if (FLAGS_use_stream_safe_cuda_allocator) { for (size_t i = 0; i < tensors.size(); ++i) { cuda_guard.SetDevice(places[i]); gpuStream_t nccl_stream; @@ -482,7 +482,7 @@ std::shared_ptr ProcessGroupNCCL::PointToPoint( } else { nccl_stream = places_to_ctx_[key][i]->stream(); } - fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); + memory::RecordStream(tensors[i].Holder(), nccl_stream); } } @@ -521,20 +521,20 @@ std::shared_ptr ProcessGroupNCCL::PointToPoint( // construct uninitialize guard for device platform::CUDADeviceGuard cuda_guard; - if (FLAGS_use_stream_safe_cuda_allocator) { + { + platform::NCCLGroupGuard nccl_guard; for (size_t i = 0; i < tensors.size(); ++i) { cuda_guard.SetDevice(places[i]); - memory::RecordStream(tensors[i].Holder(), - places_to_ctx_[key][i]->stream()); + const auto& nccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); } } - { - platform::NCCLGroupGuard nccl_guard; + if (FLAGS_use_stream_safe_cuda_allocator) { for (size_t i = 0; i < tensors.size(); ++i) { cuda_guard.SetDevice(places[i]); - const auto& nccl_stream = places_to_ctx_[key][i]->stream(); - fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); + 
memory::RecordStream(tensors[i].Holder(), + places_to_ctx_[key][i]->stream()); } } From c923e6c9b1e69cd5cbd0aa76a3982358538c7c4d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 31 Oct 2022 21:47:59 -0500 Subject: [PATCH 48/91] Adapting device-specific Extra Attributes for the PHI kernel (#46342) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add extra attr property set * add type_info for all context * add onednn context to all context * fix context compile error * simplify conv kernel args * pass runtime attr into dev_ctx * fix marco error * clear conv_grad_kernel extra args * merge conv_grad_grad into conv_grad * clear conv2d_grad_grad extra attrs * clear yaml and eager extra attr * fix conv1d error * change to thread local * fix npu compile failed * try to fix windows compile failed * add conv2d onednn phi kernel * fix ci bugs (#36) * fix compile bugs (#38) * fix extra input transform bug (#39) * support dynamic created attr (#40) * reset extra info gen code * rm conv_grad_grad kernel * reimpl pass attr adapting * add int attr support * remove vector inputnames creating * fix map at error * Update paddle/phi/kernels/onednn/conv_grad_kernel.cc Co-authored-by: Sławomir Siwek * remove useless extra attrs * replace mkldnn_engine by onednn_engine Co-authored-by: YuanRisheng Co-authored-by: Sławomir Siwek --- .../manual/eager_manual/dygraph_forward_api.h | 9 +- .../forwards/conv2d_fwd_function.cc | 41 +- .../manual/eager_manual/nodes/conv2d_nodes.cc | 29 +- .../api/manual/eager_manual/nodes/nodes.h | 32 +- paddle/fluid/framework/archive.h | 2 +- paddle/fluid/framework/channel.h | 2 +- paddle/fluid/framework/operator.cc | 147 +++- .../fused/mkldnn/fusion_gru_mkldnn_op.cc | 2 +- .../fused/mkldnn/fusion_lstm_mkldnn_op.cc | 2 +- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 16 +- .../fluid/operators/mkldnn/prelu_mkldnn_op.cc | 2 +- .../operators/mkldnn/test_mkldnn_caching.cc | 2 +- paddle/fluid/operators/ops_extra_info.h | 141 +++ .../platform/device/mlu/device_context.h | 6 +- paddle/fluid/platform/device_context.cc | 2 +- paddle/fluid/platform/device_context.h | 24 +- paddle/fluid/platform/enforce.h | 133 +-- paddle/fluid/platform/enforce_test.cc | 7 +- paddle/phi/api/yaml/legacy_backward.yaml | 26 +- paddle/phi/api/yaml/legacy_ops.yaml | 8 +- paddle/phi/backends/all_context.h | 3 +- paddle/phi/backends/cpu/cpu_context.h | 5 +- paddle/phi/backends/custom/custom_context.h | 5 +- paddle/phi/backends/gpu/gpu_context.cc | 36 + paddle/phi/backends/gpu/gpu_context.h | 15 +- paddle/phi/backends/onednn/onednn_context.cc | 137 ++- paddle/phi/backends/onednn/onednn_context.h | 23 + paddle/phi/backends/onednn/onednn_helper.h | 35 + paddle/phi/backends/onednn/onednn_reuse.h | 62 ++ paddle/phi/backends/xpu/xpu_context.h | 9 +- paddle/phi/core/attribute.h | 2 +- paddle/phi/core/device_context.h | 13 + paddle/phi/core/enforce.h | 132 ++- paddle/{fluid/framework => phi/core}/expect.h | 0 paddle/phi/core/kernel_context.h | 4 - paddle/phi/core/kernel_registry.h | 2 - paddle/phi/core/kernel_utils.h | 23 +- paddle/phi/infermeta/binary.cc | 61 +- paddle/phi/infermeta/binary.h | 44 +- paddle/phi/kernels/conv_grad_grad_kernel.h | 61 -- paddle/phi/kernels/conv_grad_kernel.h | 48 +- paddle/phi/kernels/conv_kernel.cc | 56 -- paddle/phi/kernels/conv_kernel.h | 19 +- .../phi/kernels/cpu/conv_grad_grad_kernel.cc | 72 -- paddle/phi/kernels/cpu/conv_grad_kernel.cc | 65 +- paddle/phi/kernels/cpu/conv_kernel.cc | 70 +- paddle/phi/kernels/cpu/erfinv_grad_kernel.cc | 4 + 
paddle/phi/kernels/cpu/erfinv_kernel.cc | 22 +- .../phi/kernels/gpu/conv_grad_grad_kernel.cu | 23 - paddle/phi/kernels/gpu/conv_grad_kernel.cu | 13 +- paddle/phi/kernels/gpu/conv_kernel.cu | 46 +- paddle/phi/kernels/gpu/erfinv_grad_kernel.cu | 4 + .../kernels/gpudnn/conv_grad_grad_kernel.cu | 824 ------------------ paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 811 ++++++++++++++++- paddle/phi/kernels/gpudnn/conv_kernel.cu | 25 +- .../kernels/impl/conv_grad_grad_kernel_impl.h | 330 ------- .../phi/kernels/impl/conv_grad_kernel_impl.h | 306 ++++++- paddle/phi/kernels/impl/conv_kernel_impl.h | 23 +- .../kernels/impl/erfinv_grad_kernel_impl.h | 3 - paddle/phi/kernels/impl/erfinv_kernel_impl.h | 35 - paddle/phi/kernels/onednn/conv_grad_kernel.cc | 192 ++++ paddle/phi/kernels/onednn/conv_handler.h | 763 ++++++++++++++++ paddle/phi/kernels/onednn/conv_kernel.cc | 436 +++++++++ paddle/phi/kernels/xpu/conv_grad_kernel.cc | 14 +- paddle/phi/kernels/xpu/conv_kernel.cc | 14 +- paddle/phi/ops/compat/conv2d_sig.cc | 48 +- python/paddle/fluid/dygraph/nn.py | 5 +- python/paddle/nn/functional/conv.py | 52 +- 68 files changed, 3673 insertions(+), 1955 deletions(-) rename paddle/{fluid/framework => phi/core}/expect.h (100%) delete mode 100644 paddle/phi/kernels/conv_grad_grad_kernel.h delete mode 100644 paddle/phi/kernels/conv_kernel.cc delete mode 100644 paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc delete mode 100644 paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu delete mode 100644 paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu delete mode 100644 paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h delete mode 100644 paddle/phi/kernels/impl/erfinv_kernel_impl.h create mode 100644 paddle/phi/kernels/onednn/conv_grad_kernel.cc create mode 100644 paddle/phi/kernels/onednn/conv_handler.h create mode 100644 paddle/phi/kernels/onednn/conv_kernel.cc diff --git a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h index bc970f4e2d859..22a4b03312ce0 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h +++ b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h @@ -24,10 +24,7 @@ paddle::experimental::Tensor conv2d_ad_func( const paddle::experimental::Tensor& filter, std::vector strides, std::vector paddings, - std::string paddding_algorithm, - int groups, + std::string padding_algorithm, std::vector dilations, - std::string data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search); + int groups, + std::string data_format); diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index d5f15883e0e19..842ff4822dba4 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -29,13 +29,10 @@ paddle::experimental::Tensor conv2d_ad_func( const paddle::experimental::Tensor& filter, std::vector strides, std::vector paddings, - std::string paddding_algorithm, - int groups, + std::string padding_algorithm, std::vector dilations, - std::string data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search) { + int groups, + std::string data_format) { // Dygraph Record Event paddle::platform::RecordEvent dygraph_entrance_record_event( "conv2d dygraph", paddle::platform::TracerEventType::Operator, 1); @@ -64,13 +61,10 @@ 
paddle::experimental::Tensor conv2d_ad_func( new_filter, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search); + groups, + data_format); } } @@ -92,13 +86,10 @@ paddle::experimental::Tensor conv2d_ad_func( filter, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search); + groups, + data_format); transformer->SetOutTensorLayout(&out); if (need_tune) { egr::Controller::Instance().EnableLayoutAutoTune(); @@ -119,13 +110,10 @@ paddle::experimental::Tensor conv2d_ad_func( filter, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search); + groups, + data_format); // Check NaN and Inf if needed if (FLAGS_check_nan_inf) { egr::CheckTensorHasNanOrInf("conv2d", api_result); @@ -157,13 +145,10 @@ paddle::experimental::Tensor conv2d_ad_func( // SetAttributes if needed grad_node->SetAttributestrides(strides); grad_node->SetAttributepaddings(paddings); - grad_node->SetAttributepaddding_algorithm(paddding_algorithm); + grad_node->SetAttributepadding_algorithm(padding_algorithm); grad_node->SetAttributegroups(groups); grad_node->SetAttributedilations(dilations); grad_node->SetAttributedata_format(data_format); - grad_node->SetAttributeuse_addto(use_addto); - grad_node->SetAttributeworkspace_size_MB(workspace_size_MB); - grad_node->SetAttributeexhaustive_search(exhaustive_search); // Set TensorWrappers for Forward Inputs if needed grad_node->SetTensorWrapperinput(input); grad_node->SetTensorWrapperfilter(filter); diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc index ce8d647cb9ece..647f6768bc6b1 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc @@ -46,13 +46,10 @@ Conv2dGradNodeFinal::operator()( auto& grad_out = hooked_grads[0][0]; auto& strides = this->strides_; auto& paddings = this->paddings_; - auto& paddding_algorithm = this->paddding_algorithm_; + auto& padding_algorithm = this->padding_algorithm_; auto& groups = this->groups_; auto& dilations = this->dilations_; auto& data_format = this->data_format_; - auto& use_addto = this->use_addto_; - auto& workspace_size_MB = this->workspace_size_MB_; - auto& exhaustive_search = this->exhaustive_search_; // Prepare Grad function call const auto& out_metas = OutputMeta(); @@ -87,13 +84,10 @@ Conv2dGradNodeFinal::operator()( grad_out, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, api_output_0, api_output_1); // Check NaN and Inf id needed @@ -134,13 +128,10 @@ Conv2dGradNodeFinal::operator()( // SetAttributes if needed grad_node->SetAttributestrides(strides); grad_node->SetAttributepaddings(paddings); - grad_node->SetAttributepaddding_algorithm(paddding_algorithm); + grad_node->SetAttributepadding_algorithm(padding_algorithm); grad_node->SetAttributegroups(groups); grad_node->SetAttributedilations(dilations); grad_node->SetAttributedata_format(data_format); - grad_node->SetAttributeuse_addto(use_addto); - grad_node->SetAttributeworkspace_size_MB(workspace_size_MB); - grad_node->SetAttributeexhaustive_search(exhaustive_search); // Set TensorWrappers for Forward Inputs 
if needed grad_node->SetTensorWrapperinput(input); grad_node->SetTensorWrapperfilter(filter); @@ -215,13 +206,10 @@ Conv2dDoubleGradNodeFinal::operator()( auto& strides = this->strides_; auto& paddings = this->paddings_; - auto& paddding_algorithm = this->paddding_algorithm_; + auto& padding_algorithm = this->padding_algorithm_; auto& groups = this->groups_; auto& dilations = this->dilations_; auto& data_format = this->data_format_; - auto& use_addto = this->use_addto_; - auto& workspace_size_MB = this->workspace_size_MB_; - auto& exhaustive_search = this->exhaustive_search_; // Prepare Grad function call const auto& out_metas = OutputMeta(); @@ -261,13 +249,10 @@ Conv2dDoubleGradNodeFinal::operator()( grad_filter_grad_optional, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, api_output_0, api_output_1, api_output_2); diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h index 14fe144c0094a..6fb583703848a 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h @@ -63,8 +63,8 @@ class Conv2dGradNodeFinal : public egr::GradNodeBase { void SetAttributepaddings(const std::vector& paddings) { paddings_ = paddings; } - void SetAttributepaddding_algorithm(const std::string& paddding_algorithm) { - paddding_algorithm_ = paddding_algorithm; + void SetAttributepadding_algorithm(const std::string& padding_algorithm) { + padding_algorithm_ = padding_algorithm; } void SetAttributegroups(const int& groups) { groups_ = groups; } void SetAttributedilations(const std::vector& dilations) { @@ -73,13 +73,6 @@ class Conv2dGradNodeFinal : public egr::GradNodeBase { void SetAttributedata_format(const std::string& data_format) { data_format_ = data_format; } - void SetAttributeuse_addto(const bool& use_addto) { use_addto_ = use_addto; } - void SetAttributeworkspace_size_MB(const int& workspace_size_MB) { - workspace_size_MB_ = workspace_size_MB; - } - void SetAttributeexhaustive_search(const bool& exhaustive_search) { - exhaustive_search_ = exhaustive_search; - } private: // TensorWrappers @@ -89,13 +82,10 @@ class Conv2dGradNodeFinal : public egr::GradNodeBase { // Attributes std::vector strides_; std::vector paddings_; - std::string paddding_algorithm_; + std::string padding_algorithm_; int groups_; std::vector dilations_; std::string data_format_; - bool use_addto_; - int workspace_size_MB_; - bool exhaustive_search_; }; class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase { @@ -146,8 +136,8 @@ class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase { void SetAttributepaddings(const std::vector& paddings) { paddings_ = paddings; } - void SetAttributepaddding_algorithm(const std::string& paddding_algorithm) { - paddding_algorithm_ = paddding_algorithm; + void SetAttributepadding_algorithm(const std::string& padding_algorithm) { + padding_algorithm_ = padding_algorithm; } void SetAttributegroups(const int& groups) { groups_ = groups; } void SetAttributedilations(const std::vector& dilations) { @@ -156,13 +146,6 @@ class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase { void SetAttributedata_format(const std::string& data_format) { data_format_ = data_format; } - void SetAttributeuse_addto(const bool& use_addto) { use_addto_ = use_addto; } - void SetAttributeworkspace_size_MB(const int& workspace_size_MB) { - workspace_size_MB_ = 
workspace_size_MB; - } - void SetAttributeexhaustive_search(const bool& exhaustive_search) { - exhaustive_search_ = exhaustive_search; - } private: // TensorWrappers @@ -173,13 +156,10 @@ class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase { // Attributes std::vector strides_; std::vector paddings_; - std::string paddding_algorithm_; + std::string padding_algorithm_; int groups_; std::vector dilations_; std::string data_format_; - bool use_addto_; - int workspace_size_MB_; - bool exhaustive_search_; }; class AddNGradNodeFinal : public egr::GradNodeBase { diff --git a/paddle/fluid/framework/archive.h b/paddle/fluid/framework/archive.h index a68470492e936..5e7e510b06106 100644 --- a/paddle/fluid/framework/archive.h +++ b/paddle/fluid/framework/archive.h @@ -32,8 +32,8 @@ #include #include -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/expect.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index fe6c51b87228a..f498b1db2de70 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -30,7 +30,7 @@ #include #include -#include "paddle/fluid/framework/expect.h" +#include "paddle/phi/core/expect.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f3c2f6c4b2eb6..b4714407686d8 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/isfinite_op.h" +#include "paddle/fluid/operators/ops_extra_info.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" @@ -2269,7 +2270,8 @@ Scope* OperatorWithKernel::PrepareData( } std::unique_ptr new_expected_kernel_key = nullptr; - if (run_phi_kernel_ && in_def->backend != phi::Backend::ALL_BACKEND) { + if (run_phi_kernel_ && in_def != nullptr && + in_def->backend != phi::Backend::ALL_BACKEND) { auto tensor_backend = phi::TransToPhiBackend(tensor_in->place()); if ((in_def->backend != tensor_backend && (in_def->backend != phi::Backend::GPUDNN || @@ -2388,7 +2390,6 @@ Scope* OperatorWithKernel::PrepareData( input_names.size(), input_defs.size())); for (size_t i = 0; i < input_defs.size(); ++i) { - const auto& input_defs = phi_kernel_->args_def().input_defs(); auto& in_def = input_defs.at(i); std::string input_name = input_names[i]; auto iter = ctx->inputs.find(input_name); @@ -2400,6 +2401,22 @@ Scope* OperatorWithKernel::PrepareData( no_buffer_ins && no_buffer_ins->count(input_name) > 0; prepare_input_data(input_name, &ins_vector, &in_def, should_skip_input); } +#ifdef PADDLE_WITH_MKLDNN + // For input that is Extra, only MKLDNN will use Extra Inputs + auto& extra_input_names = + paddle::operators::ExtraInfoUtils::Instance().GetExtraInputNamesMap( + Type()); + for (const auto& input_name : extra_input_names) { + auto iter = ctx->inputs.find(input_name); + if (iter == ctx->inputs.end()) { + continue; + } + bool should_skip_input = + no_buffer_ins && no_buffer_ins->count(input_name) > 0; + std::vector& input_vars = iter->second; + prepare_input_data(input_name, &input_vars, nullptr, should_skip_input); + } +#endif } else { for (auto& var_name_item : Inputs()) { bool should_skip_input = @@ -2699,6 +2716,65 @@ 
phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( return (*arg_map_fn_)(arg_mapping_ctx); } +static void SetDnnAttrIntoDeviceContext( + phi::DeviceContext* dev_ctx, + const Attribute& attr, + const std::string& attr_name, + const operators::ExtraAttrPropertySet& attr_propertys) { +#ifdef PADDLE_WITH_MKLDNN + if (phi::OneDNNContext::classof(dev_ctx) && + attr_propertys.Support(operators::ExtraAttrProperty::ONEDNN)) { + VLOG(4) << "Runtime attr `" << attr_name << "` is passed to OneDNNContext."; + phi::OneDNNContext* one_dnn_ctx = static_cast(dev_ctx); + switch (AttrTypeID(attr)) { + case proto::AttrType::FLOAT: + one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(float, attr)); + break; + case proto::AttrType::INT: + one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int, attr)); + break; + case proto::AttrType::STRING: + one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(std::string, attr)); + break; + case proto::AttrType::INTS: + one_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::FLOATS: + one_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::BOOLEAN: + one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(bool, attr)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported Attribute value type `%s` for phi.", + platform::demangle(attr.type().name()))); + } + } +#endif +#ifdef PADDLE_WITH_CUDA + if (phi::GPUContext::classof(dev_ctx) && + attr_propertys.Support(operators::ExtraAttrProperty::GPUDNN)) { + VLOG(4) << "Runtime attr `" << attr_name << "` is passed to GPUDNNContext."; + phi::GPUContext* gpu_dnn_ctx = static_cast(dev_ctx); + switch (AttrTypeID(attr)) { + case proto::AttrType::INT: + gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int, attr)); + break; + case proto::AttrType::BOOLEAN: + gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(bool, attr)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported Attribute value type `%s` for phi.", + platform::demangle(attr.type().name()))); + } + } +#endif +} + void OperatorWithKernel::BuildPhiKernelContext( const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, @@ -2713,6 +2789,15 @@ void OperatorWithKernel::BuildPhiKernelContext( auto attr_defs = phi_kernel_->args_def().attribute_defs(); auto output_defs = phi_kernel_->args_def().output_defs(); +#if defined(PADDLE_WITH_MKLDNN) + if (phi::OneDNNContext::classof(dev_ctx)) { + // Onednn holds this op's variable's name and init them here. 
+    phi::OneDNNContext* one_dnn_ctx = static_cast<phi::OneDNNContext*>(dev_ctx);
+    one_dnn_ctx->SetInputsName(Inputs());
+    one_dnn_ctx->SetOutputsName(Outputs());
+  }
+#endif
+
   PADDLE_ENFORCE_EQ(input_names.size(),
                     input_defs.size(),
                     platform::errors::InvalidArgument(
@@ -2992,6 +3077,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
       } break;
       default: {
         if (attr_iter == Attrs().end()) {
+          // TODO(chenweihang): remove this backup searching later
           attr_iter = RuntimeAttrs().find(attr_names[i]);
           PADDLE_ENFORCE_NE(attr_iter,
                             RuntimeAttrs().end(),
@@ -3075,6 +3161,63 @@ void OperatorWithKernel::BuildPhiKernelContext(
     }
   }
   VLOG(4) << "Done attributes";
+
+  // For compatibility with Ops that have extra attrs for a specific backend
+#if defined(PADDLE_WITH_MKLDNN) || defined(PADDLE_WITH_CUDA)
+  auto& runtime_attrs = RuntimeAttrs();
+  for (const auto& attr_iter : runtime_attrs) {
+    auto& attr_name = attr_iter.first;
+    auto& attr = attr_iter.second;
+    auto attr_propertys = paddle::operators::GetExtraAttrPropertys(attr_name);
+    SetDnnAttrIntoDeviceContext(dev_ctx, attr, attr_name, attr_propertys);
+  }
+  // TODO(chenweihang): Since the passes will still `SetAttr` in the OpDesc,
+  // we try to add these Attrs to the RuntimeAttrs, but the OpDesc loses the
+  // RuntimeAttrs information when the Graph is converted back to a Program,
+  // so an additional recording mechanism would have to be introduced, which
+  // increases the cost of development and understanding. For the time being
+  // we therefore still read the attributes set by these passes from Attrs.
+  // In the future, it is necessary to clarify the positioning of
+  // RuntimeAttrs and expand related functions.
+  auto& attrs = Attrs();
+  for (const auto& attr_iter : attrs) {
+    auto& attr_name = attr_iter.first;
+    auto& attr = attr_iter.second;
+    auto attr_propertys = paddle::operators::GetExtraAttrPropertys(attr_name);
+    SetDnnAttrIntoDeviceContext(dev_ctx, attr, attr_name, attr_propertys);
+  }
+  VLOG(4) << "Done runtime attributes";
+#endif
+
+// For compatibility with Ops that have extra inputs for the onednn backend
+#ifdef PADDLE_WITH_MKLDNN
+  if (phi::OneDNNContext::classof(dev_ctx)) {
+    phi::OneDNNContext* one_dnn_ctx = static_cast<phi::OneDNNContext*>(dev_ctx);
+    auto& extra_input_names =
+        paddle::operators::ExtraInfoUtils::Instance().GetExtraInputNamesMap(
+            Type());
+    for (const auto& input_name : extra_input_names) {
+      auto it = ctx.inputs.find(input_name);
+      if (it == ctx.inputs.end() || it->second.size() == 0) {
+        one_dnn_ctx->SetDnnInput(input_name, nullptr);
+      } else {
+        auto ins_vector = it->second;
+        PADDLE_ENFORCE_EQ(
+            ins_vector.size(),
+            1UL,
+            phi::errors::InvalidArgument(
+                "OneDNN's extra input only allows one input tensor."));
+        auto* var = ins_vector[0];
+        PADDLE_ENFORCE_EQ(var->IsType<phi::DenseTensor>(),
+                          true,
+                          phi::errors::InvalidArgument(
+                              "OneDNN's extra input only can be DenseTensor."));
+        one_dnn_ctx->SetDnnInput(input_name, &(var->Get<phi::DenseTensor>()));
+      }
+    }
+  }
+  VLOG(4) << "Done runtime extra inputs";
+#endif
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
index 13b62b5a47244..b49c0cafffc65 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
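For reference, this is how the extra-input plumbing above is meant to be consumed on the kernel side. A minimal sketch, assuming a phi oneDNN kernel whose op registered "ResidualData" as an extra input (the kernel body itself is illustrative, not part of this patch):

// Sketch: consumer side of the OneDNNContext extra inputs.
// "ResidualData" is one of the names in g_extra_input_names_map_.
const phi::DenseTensor* residual =
    dev_ctx.HasDnnInput("ResidualData") ? dev_ctx.GetDnnInput("ResidualData")
                                        : nullptr;
if (residual != nullptr) {
  // fuse the residual connection into the oneDNN primitive
}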
*/ #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/operators/fused/fusion_gru_op.h" #include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h" +#include "paddle/phi/core/expect.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc index fd113489238f7..a9464e947cf0a 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/operators/fused/fusion_lstm_op.h" #include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h" +#include "paddle/phi/core/expect.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 60673bbc72534..66e8ba222c0d8 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -14,11 +14,11 @@ #include -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/core/expect.h" #include "paddle/phi/core/visit_type.h" @@ -1184,20 +1184,6 @@ class ConvMKLDNNGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_KERNEL(conv2d, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::ConvMKLDNNOpKernel, - ops::ConvMKLDNNOpKernel, - ops::ConvMKLDNNOpKernel, - ops::ConvMKLDNNOpKernel); - -REGISTER_OP_KERNEL(conv2d_grad, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::ConvMKLDNNGradOpKernel, - ops::ConvMKLDNNGradOpKernel); - REGISTER_OP_KERNEL(depthwise_conv2d, MKLDNN, ::paddle::platform::CPUPlace, diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc index e3b9d3ffd7c6a..246bfacca7f4e 100644 --- a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/core/expect.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 60c9b8f265960..9a4237230e86f 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -36,7 +36,7 @@ PD_DECLARE_KERNEL(relu, OneDNN, ONEDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(conv2d); -USE_OP_DEVICE_KERNEL(conv2d, MKLDNN); +PD_DECLARE_KERNEL(conv2d, OneDNN, ONEDNN); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index 8f3780dd3a3ad..d0847df43e230 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -14,11 +14,137 @@ #pragma once +#include +#include +#include + #include "paddle/fluid/framework/attribute.h" namespace paddle { namespace operators { +// This file is to be compatible with the bad design and +// implementation of fluid in the past + +// Many operators in fluid have extra attributes, which are generally added +// to implement some specific kernel selection and to meet the specialization +// needs of a specific operation library like mkldnn or cudnn +enum class ExtraAttrProperty : uint8_t { + // The attributes that are no longer used by any scene + DEPRECATED = 0, + // The attributes used for framework execution scheduling, + // such as `use_mkldnn`, `use_cudnn`, no need to save + SCHEDULE, + // The attributes for ONEDNN only, can be saved in OneDNNContext + ONEDNN, + // The attributes for ONEDNN only, can be saved in GPUContext + GPUDNN, + // Add necessary properties as needed +}; + +class ExtraAttrPropertySet final { + public: + constexpr ExtraAttrPropertySet() : bitset_(0) {} + constexpr ExtraAttrPropertySet(ExtraAttrProperty e) // NOLINT + : bitset_(e == ExtraAttrProperty::DEPRECATED + ? 
0 + : 1ULL << (static_cast(e) - 1)) {} + + inline uint64_t bitset() const { return bitset_; } + + bool inline Support(ExtraAttrProperty e) const { + // DEPRECATED ExtraAttr always return false + return static_cast(bitset_ & ExtraAttrPropertySet(e).bitset()); + } + bool IsEmpty() const { return bitset_ == 0; } + + ExtraAttrPropertySet operator|(const ExtraAttrPropertySet& other) const { + return ExtraAttrPropertySet(bitset_ | other.bitset()); + } + ExtraAttrPropertySet operator&(const ExtraAttrPropertySet& other) const { + return ExtraAttrPropertySet(bitset_ & other.bitset()); + } + ExtraAttrPropertySet operator-(const ExtraAttrPropertySet& other) const { + return ExtraAttrPropertySet(bitset_ & ~other.bitset()); + } + ExtraAttrPropertySet operator^(const ExtraAttrPropertySet& other) const { + return ExtraAttrPropertySet(bitset_ ^ other.bitset()); + } + + bool operator==(const ExtraAttrPropertySet& other) const { + return bitset_ == other.bitset(); + } + + private: + constexpr ExtraAttrPropertySet(uint64_t bitset) : bitset_(bitset) {} + uint64_t bitset_; +}; + +const std::unordered_map + extra_attr_properties = { + // DEPRECATED attributes + {"use_quantizer", ExtraAttrProperty::DEPRECATED}, + // SCHEDULE attributes + {"use_cudnn", ExtraAttrProperty::SCHEDULE}, + {"use_mkldnn", ExtraAttrProperty::SCHEDULE}, + // ONEDNN dedicated attributes + {"Bias", ExtraAttrProperty::ONEDNN}, + {"data_format", ExtraAttrProperty::ONEDNN}, + {"force_fp32_output", ExtraAttrProperty::ONEDNN}, + {"fuse_activation", ExtraAttrProperty::ONEDNN}, + {"fuse_activation_type", ExtraAttrProperty::ONEDNN}, + {"fuse_activation_alpha", ExtraAttrProperty::ONEDNN}, + {"fuse_activation_beta", ExtraAttrProperty::ONEDNN}, + {"fuse_activation_scale", ExtraAttrProperty::ONEDNN}, + {"fuse_alpha", ExtraAttrProperty::ONEDNN}, + {"fuse_beta", ExtraAttrProperty::ONEDNN}, + {"fuse_relu", ExtraAttrProperty::ONEDNN}, + {"fuse_residual_connection", ExtraAttrProperty::ONEDNN}, + {"fuse_with_relu", ExtraAttrProperty::ONEDNN}, + {"fused_reshape_Out", ExtraAttrProperty::ONEDNN}, + {"fused_transpose_Out", ExtraAttrProperty::ONEDNN}, + {"fused_reshape_X", ExtraAttrProperty::ONEDNN}, + {"fused_reshape_Y", ExtraAttrProperty::ONEDNN}, + {"fused_transpose_X", ExtraAttrProperty::ONEDNN}, + {"fused_transpose_Y", ExtraAttrProperty::ONEDNN}, + {"mkldnn_data_type", ExtraAttrProperty::ONEDNN}, + {"ResidualData", ExtraAttrProperty::ONEDNN}, + {"scale_x", ExtraAttrProperty::ONEDNN}, + {"scale_y", ExtraAttrProperty::ONEDNN}, + {"scale_out", ExtraAttrProperty::ONEDNN}, + {"Scale_in", ExtraAttrProperty::ONEDNN}, + {"Scale_in_eltwise", ExtraAttrProperty::ONEDNN}, + {"Scale_x", ExtraAttrProperty::ONEDNN}, + {"Scale_y", ExtraAttrProperty::ONEDNN}, + {"Scale_out", ExtraAttrProperty::ONEDNN}, + {"Scale_weights", ExtraAttrProperty::ONEDNN}, + {"x_data_format", ExtraAttrProperty::ONEDNN}, + {"y_data_format", ExtraAttrProperty::ONEDNN}, + // ONEDNN pass dedicated attributes + {"Activation_scale", ExtraAttrProperty::ONEDNN}, + {"Bias_scales", ExtraAttrProperty::ONEDNN}, + {"Output_shift_scale", ExtraAttrProperty::ONEDNN}, + {"Sum_scale", ExtraAttrProperty::ONEDNN}, + // GPUDNN dedicated attributes + {"exhaustive_search", ExtraAttrProperty::GPUDNN}, + {"fuse_relu_before_depthwise_conv", ExtraAttrProperty::GPUDNN}, + {"use_addto", ExtraAttrProperty::GPUDNN}, + {"workspace_size_MB", ExtraAttrProperty::GPUDNN}, + // Mixed-use attributes + {"is_test", + ExtraAttrPropertySet(ExtraAttrProperty::ONEDNN) | + ExtraAttrPropertySet(ExtraAttrProperty::GPUDNN)}, +}; + +inline 
ExtraAttrPropertySet GetExtraAttrPropertys( + const std::string& attr_name) { + auto iter = extra_attr_properties.find(attr_name); + if (iter != extra_attr_properties.end()) { + return iter->second; + } + return ExtraAttrPropertySet(); +} + template struct ExtraAttrChecker { ExtraAttrChecker(const std::string& attr_name, T default_value) @@ -71,6 +197,15 @@ class ExtraInfoUtils { return empty_extra_attrs_checker_; } + const std::vector& GetExtraInputNamesMap( + const std::string& op_type) const { + auto iter = g_extra_input_names_map_.find(op_type); + if (iter != g_extra_input_names_map_.end()) { + return iter->second; + } + return empty_extra_input_names_; + } + private: ExtraInfoUtils(); @@ -83,6 +218,12 @@ class ExtraInfoUtils { g_extra_attrs_checker_; std::vector> empty_extra_attrs_checker_{}; + + // TODO(chenweihang): move these extra inputs into op_compat.yaml + std::unordered_map> + g_extra_input_names_map_ = {{"conv2d", {"Bias", "ResidualData"}}, + {"conv2d_grad", {"Bias"}}}; + std::vector empty_extra_input_names_; }; } // namespace operators diff --git a/paddle/fluid/platform/device/mlu/device_context.h b/paddle/fluid/platform/device/mlu/device_context.h index e1028667bc207..a430e18a34abf 100644 --- a/paddle/fluid/platform/device/mlu/device_context.h +++ b/paddle/fluid/platform/device/mlu/device_context.h @@ -89,7 +89,9 @@ class MLUContext { DISABLE_COPY_AND_ASSIGN(MLUContext); }; -class MLUDeviceContext : public DeviceContext { +class MLUDeviceContext + : public DeviceContext, + public phi::TypeInfoTraits { public: explicit MLUDeviceContext(MLUPlace place); virtual ~MLUDeviceContext(); @@ -148,6 +150,8 @@ class MLUDeviceContext : public DeviceContext { return thread_ctx_.at(this); } + static const char* name() { return "MLUDeviceContext"; } + private: int compute_capability_; int driver_version_; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index c39705b618f65..f0b1efc769430 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device/device_wrapper.h" @@ -28,6 +27,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/expect.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index f0119d1f839cb..0ad10cd8a7b3a 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -144,7 +144,9 @@ struct DefaultDeviceContextType { // Graphcore IPU #ifdef PADDLE_WITH_IPU -class IPUDeviceContext : public DeviceContext { +class IPUDeviceContext + : public DeviceContext, + public phi::TypeInfoTraits { public: IPUDeviceContext() = delete; explicit IPUDeviceContext(IPUPlace place); @@ -154,6 +156,8 @@ class IPUDeviceContext : public DeviceContext { /*! \brief Wait for all operations completion in the stream. 
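The property set defined above is what SetDnnAttrIntoDeviceContext keys off when BuildPhiKernelContext routes leftover attributes. A minimal lookup sketch, using only names declared in this header:

// Sketch: deciding where an extra attribute should be forwarded.
auto props = paddle::operators::GetExtraAttrPropertys("workspace_size_MB");
if (props.Support(paddle::operators::ExtraAttrProperty::GPUDNN)) {
  // forward the value into the GPUContext
}
// DEPRECATED or unknown names yield an empty set and are simply dropped:
// paddle::operators::GetExtraAttrPropertys("use_quantizer").IsEmpty() == true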
*/ void Wait() const override; + static const char* name() { return "IPUDeviceContext"; } + private: IPUPlace place_; }; @@ -188,7 +192,9 @@ struct DefaultDeviceContextType { #endif #ifdef PADDLE_WITH_ASCEND_CL -class NPUDeviceContext : public DeviceContext { +class NPUDeviceContext + : public DeviceContext, + public phi::TypeInfoTraits { public: explicit NPUDeviceContext(NPUPlace place); virtual ~NPUDeviceContext(); @@ -224,6 +230,8 @@ class NPUDeviceContext : public DeviceContext { // void WaitStreamCallback() const { return stream_->WaitCallback(); } + static const char* name() { return "NPUDeviceContext"; } + private: NPUPlace place_; aclrtContext context_; @@ -248,7 +256,9 @@ struct DefaultDeviceContextType { }; // Currently, NPUPinnedDeviceContext is only used to data copying. -class NPUPinnedDeviceContext : public DeviceContext { +class NPUPinnedDeviceContext + : public DeviceContext, + public phi::TypeInfoTraits { public: NPUPinnedDeviceContext(); explicit NPUPinnedDeviceContext(NPUPinnedPlace place); @@ -257,6 +267,8 @@ class NPUPinnedDeviceContext : public DeviceContext { Eigen::DefaultDevice* eigen_device() const; + static const char* name() { return "NPUPinnedDeviceContext"; } + private: NPUPinnedPlace place_; std::unique_ptr eigen_device_; @@ -276,7 +288,9 @@ struct DefaultDeviceContextType { }; // Currently, CUDAPinnedDeviceContext is only used to data copying. -class CUDAPinnedDeviceContext : public DeviceContext { +class CUDAPinnedDeviceContext + : public DeviceContext, + public phi::TypeInfoTraits { public: CUDAPinnedDeviceContext(); explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place); @@ -285,6 +299,8 @@ class CUDAPinnedDeviceContext : public DeviceContext { Eigen::DefaultDevice* eigen_device() const; + static const char* name() { return "CUDAPinnedDeviceContext"; } + private: CUDAPinnedPlace place_; std::unique_ptr eigen_device_; diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 3cad2e3d2055e..5bd9029179ffe 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -122,77 +122,80 @@ using namespace ::phi::enforce; // NOLINT #endif /* - * Summary: This PADDLE_GET(_**) series macros are used to call paddle::get - * safely. paddle::get is not a completely safe api, although it will not - * go wrong in most cases, but in extreme cases, it may fail and directly - * throw a paddle::bad_variant_access const exception, without any stack - *information. - * This kind of problems is difficult to debug, so add these macros to - * enrich paddle::get error information. At the same time, we restrict - * the direct use of paddle::get by CI rule. + * Summary: This macro is used to get Variable or internal type + * data (such as LoDTensor or SelectedRows) of the Input and + * Output in op, generally used when call scope.FindVar(Input/ + * Output("Name")) or ctx.Input(). + * Firstly this macro check whether the obtained pointer is null, + * and then return data if it is not null. + * + * Note: This macro is only suitable for specific scenarios and + * does not intended to be widely used. If it cannot meet the + * requirements, please use other PADDLE_ENFORCE** check macro. 
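The TypeInfoTraits mixin applied to the device contexts in this patch, together with the static name() method, is what makes the classof-based downcasts in operator.cc work without RTTI. A minimal sketch of the caller side, assuming a build with oneDNN enabled:

// Sketch: RTTI-free downcast via TypeInfoTraits::classof.
void Demo(phi::DeviceContext* dev_ctx) {
  if (phi::OneDNNContext::classof(dev_ctx)) {
    auto* one_dnn_ctx = static_cast<phi::OneDNNContext*>(dev_ctx);
    one_dnn_ctx->SetDnnAttr("fuse_beta", 0.0f);
  }
}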
* * Parameters: - *     __TYPE: the target variable type - * __VALUE: the target variable to get + *     __PTR: pointer + * __ROLE: (string), Input or Output + * __NAME: (string), Input or Output name + * __OP_TYPE: (string), the op type * - * Examples: - * - unsafe writing: int x = paddle::get(y); - * - safe writing: int x = PADDLE_GET(int, y); + * Return: The data pointed to by the pointer. * - * Note: GCC 4.8 cannot select right overloaded function here, so need - * to define different functions and macros here, after we upgreade - * CI gcc version, we can only define one PADDLE_GET macro. + * Examples: + * GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", "Mul"); */ -namespace details { - -using namespace phi::enforce::details; // NOLINT - -#define DEFINE_SAFE_PADDLE_GET( \ - __InputType, __OutputType, __OutputTypePtr, __FuncName) \ - template \ - auto __FuncName( \ - __InputType input, const char* expression, const char* file, int line) \ - ->typename std::conditional::value, \ - __OutputTypePtr, \ - __OutputType>::type { \ - try { \ - return paddle::get(input); \ - } catch (paddle::bad_variant_access const&) { \ - HANDLE_THE_ERROR \ - throw ::phi::enforce::EnforceNotMet( \ - phi::errors::InvalidArgument( \ - "paddle::get failed, cannot get value " \ - "(%s) by type %s, its type is %s.", \ - expression, \ - phi::enforce::demangle(typeid(OutputType).name()), \ - phi::enforce::demangle(input.type().name())), \ - file, \ - line); \ - END_HANDLE_THE_ERROR \ - } \ - } - -DEFINE_SAFE_PADDLE_GET(InputType&, OutputType&, OutputType*, SafeBoostGet); -DEFINE_SAFE_PADDLE_GET(const InputType&, - const OutputType&, - const OutputType*, - SafeBoostGetConst); -DEFINE_SAFE_PADDLE_GET(InputType&&, - OutputType, - OutputType*, - SafeBoostGetMutable); - -} // namespace details +#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \ + (([&]() -> std::add_lvalue_reference::type { \ + auto* __ptr = (__PTR); \ + if (UNLIKELY(nullptr == __ptr)) { \ + auto __summary__ = phi::errors::NotFound( \ + "Unable to get %s data of %s %s in operator %s. " \ + "Possible reasons are:\n" \ + " 1. The %s is not the %s of operator %s;\n" \ + " 2. The %s has no corresponding variable passed in;\n" \ + " 3. The %s corresponding variable is not initialized.", \ + phi::demangle( \ + typeid(std::add_lvalue_reference::type) \ + .name()), \ + __ROLE, \ + __NAME, \ + __OP_TYPE, \ + __NAME, \ + __ROLE, \ + __OP_TYPE, \ + __NAME, \ + __NAME); \ + auto __message__ = ::paddle::string::Sprintf( \ + "%s\n [Hint: pointer " #__PTR " should not be null.]", \ + __summary__.error_message()); \ + __THROW_ERROR_INTERNAL__( \ + phi::ErrorSummary(__summary__.code(), __message__)); \ + } \ + return *__ptr; \ + })()) -#define PADDLE_GET(__TYPE, __VALUE) \ - paddle::platform::details::SafeBoostGet<__TYPE>( \ - __VALUE, #__VALUE, __FILE__, __LINE__) -#define PADDLE_GET_CONST(__TYPE, __VALUE) \ - paddle::platform::details::SafeBoostGetConst<__TYPE>( \ - __VALUE, #__VALUE, __FILE__, __LINE__) -#define PADDLE_GET_MUTABLE(__TYPE, __VALUE) \ - paddle::platform::details::SafeBoostGetMutable<__TYPE>( \ - __VALUE, #__VALUE, __FILE__, __LINE__) +/* + * Summary: This macro is used to check whether op has specified + * Input or Output Variables. Because op's Input and Output + * checking are written similarly, so abstract this macro. 
+ * + * Parameters: + *     __EXPR: (bool), the bool expression + * __ROLE: (string), Input or Output + * __NAME: (string), Input or Output name + * __OP_TYPE: (string), the op type + * + * Examples: + * OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul"); + */ +#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \ + do { \ + PADDLE_ENFORCE_EQ( \ + __EXPR, \ + true, \ + phi::errors::NotFound( \ + "No %s(%s) found for %s operator.", __ROLE, __NAME, __OP_TYPE)); \ + } while (0) /** OTHER EXCEPTION AND ENFORCE **/ diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 48aece1c4170b..9fc200ca82f1c 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -528,10 +528,9 @@ struct CannotToStringType { }; TEST(enforce, cannot_to_string_type) { - static_assert( - !paddle::platform::details::CanToString::kValue, - "CannotToStringType must not be converted to string"); - static_assert(paddle::platform::details::CanToString::kValue, + static_assert(!phi::enforce::details::CanToString::kValue, + "CannotToStringType must not be converted to string"); + static_assert(phi::enforce::details::CanToString::kValue, "int can be converted to string"); CannotToStringType obj1(3), obj2(4), obj3(3); diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 0eb91e9b51d43..7d9da75c23209 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -312,8 +312,8 @@ func : conj - backward_op : conv2d_grad - forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) - args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) output : Tensor(input_grad), Tensor(filter_grad) infer_meta : func : GeneralBinaryGradInferMeta @@ -324,8 +324,8 @@ backward : conv2d_grad_grad - backward_op : conv2d_grad_grad - forward : conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) - args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + forward : conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) output : Tensor(input_grad), Tensor(filter_grad), 
Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta @@ -357,8 +357,8 @@ backward : conv2d_transpose_double_grad - backward_op : conv3d_grad - forward : conv3d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) - args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + forward : conv3d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor(input_grad), Tensor(filter_grad) infer_meta : func : GeneralBinaryGradInferMeta @@ -369,8 +369,8 @@ backward : conv3d_grad_grad - backward_op : conv3d_grad_grad - forward : conv3d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) - args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + forward : conv3d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta @@ -439,21 +439,21 @@ optional : mask - backward_op : depthwise_conv2d_grad - forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(out) - args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) + forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool 
use_gpudnn) output : Tensor(input_grad), Tensor(filter_grad) infer_meta : func : GeneralBinaryGradInferMeta param : [input, filter] kernel : func : depthwise_conv2d_grad - param : [input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, fuse_relu] + param : [input, filter, out_grad, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, fuse_relu] use_gpudnn : use_gpudnn backward : depthwise_conv2d_grad_grad - backward_op : depthwise_conv2d_grad_grad - forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(grad_input), Tensor(grad_filter) - args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) + forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index c711f6bd42710..e6aa3b18f5f86 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -454,7 +454,7 @@ backward : conj_grad - op : conv2d - args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) output : Tensor infer_meta : func : ConvInferMeta @@ -474,10 +474,10 @@ backward : conv2d_transpose_grad - op : conv3d - args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor infer_meta : - func : ConvInferMeta + func : Conv3DInferMeta kernel : func : conv3d use_gpudnn : true @@ -564,7 +564,7 @@ args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) output : Tensor(out) infer_meta : - func : ConvInferMeta + func : DepthwiseConvInferMeta param : [x, filter, 
strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search]
   kernel :
     func : depthwise_conv2d
diff --git a/paddle/phi/backends/all_context.h b/paddle/phi/backends/all_context.h
index 392df09fcffd8..12cb45d23812f 100644
--- a/paddle/phi/backends/all_context.h
+++ b/paddle/phi/backends/all_context.h
@@ -23,9 +23,8 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/custom/custom_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
-#ifdef PADDLE_WITH_XPU
+#include "paddle/phi/backends/onednn/onednn_context.h"
 #include "paddle/phi/backends/xpu/xpu_context.h"
-#endif
 
 #ifndef PADDLE_WITH_CUSTOM_KERNEL
 // TODO(wilber): DeviceContextPool needs include fluid file.
diff --git a/paddle/phi/backends/cpu/cpu_context.h b/paddle/phi/backends/cpu/cpu_context.h
index 58548b2e04e02..3a8b5b0a82adc 100644
--- a/paddle/phi/backends/cpu/cpu_context.h
+++ b/paddle/phi/backends/cpu/cpu_context.h
@@ -24,7 +24,8 @@ limitations under the License. */
 
 namespace phi {
 
-class PADDLE_API CPUContext : public DeviceContext {
+class PADDLE_API CPUContext : public DeviceContext,
+                              public TypeInfoTraits<DeviceContext, CPUContext> {
  public:
   CPUContext();
   CPUContext(CPUContext&&);
@@ -34,6 +35,8 @@ class PADDLE_API CPUContext : public DeviceContext {
   Eigen::DefaultDevice* eigen_device() const;
   const Place& GetPlace() const override;
 
+  static const char* name() { return "CPUContext"; }
+
  protected:
   // NOTE: External users manage resources. Used in inference scenarios.
   // The Set interface is for inference only, DeviceContext will mark the
diff --git a/paddle/phi/backends/custom/custom_context.h b/paddle/phi/backends/custom/custom_context.h
index 57be8534fa954..d007cb62cd4f9 100644
--- a/paddle/phi/backends/custom/custom_context.h
+++ b/paddle/phi/backends/custom/custom_context.h
@@ -21,7 +21,8 @@ limitations under the License. */
 
 namespace phi {
 
-class CustomContext : public DeviceContext {
+class CustomContext : public DeviceContext,
+                      public TypeInfoTraits<DeviceContext, CustomContext> {
  public:
   explicit CustomContext(const CustomPlace&);
 
@@ -35,6 +36,8 @@ class CustomContext : public DeviceContext {
   // Wait for all operations completion in the stream.
   void Wait() const override;
 
+  static const char* name() { return "CustomContext"; }
+
  public:
   // NOTE: DeviceContext hold resources. Used in training scenarios.
   // The interface used by the training scene, DeviceContext will initialize
diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc
index cf08f5b4affa9..69f0baf069984 100644
--- a/paddle/phi/backends/gpu/gpu_context.cc
+++ b/paddle/phi/backends/gpu/gpu_context.cc
@@ -717,6 +717,23 @@ struct GPUContext::Impl {
     }
   }
 
+  bool HasDnnAttr(const std::string& attr_name) const {
+    return dnn_attrs_.count(attr_name) != 0UL;
+  }
+
+  const Attribute& GetDnnAttr(const std::string& attr_name) const {
+    auto iter = dnn_attrs_.find(attr_name);
+    PADDLE_ENFORCE_NE(
+        iter,
+        dnn_attrs_.end(),
+        phi::errors::NotFound("Attribute `%s` is not found in GPUContext.",
+                              attr_name));
+    return iter->second;
+  }
+
+  void SetDnnAttr(const std::string& attr_name, Attribute attr) {
+    dnn_attrs_[attr_name] = attr;
+  }
+
   // use one flag for all handles?
   // they should be accessed consistently
   bool owned_{false};
@@ -780,8 +797,15 @@ struct GPUContext::Impl {
   Allocator* allocator_{nullptr};  // external resource.
   // An internal resource to initialize eigen_device.
std::unique_ptr eigen_stream_{nullptr}; + + // Holds some attributes only used by the gpudnn kernel calculation + // Because DeviceContext is a global singleton, you need to ensure thread + // safety, use the thread_local variable + static thread_local AttributeMap dnn_attrs_; }; +thread_local AttributeMap GPUContext::Impl::dnn_attrs_ = {}; + GPUContext::GPUContext(GPUContext&&) = default; GPUContext& GPUContext::operator=(GPUContext&&) = default; @@ -1000,4 +1024,16 @@ void GPUContext::SetDriverVersion(int val) { impl_->driver_version_ = val; } void GPUContext::SetRuntimeVersion(int val) { impl_->runtime_version_ = val; } +bool GPUContext::HasDnnAttr(const std::string& attr_name) const { + return impl_->HasDnnAttr(attr_name); +} + +const Attribute& GPUContext::GetDnnAttr(const std::string& attr_name) const { + return impl_->GetDnnAttr(attr_name); +} + +void GPUContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { + return impl_->SetDnnAttr(attr_name, std::move(attr)); +} + } // namespace phi diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 989bbbcbbf5f8..84aba73fad1e1 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_helper.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/attribute.h" #include "paddle/phi/core/device_context.h" namespace phi { @@ -77,7 +78,8 @@ class DnnWorkspaceHandle { std::unique_ptr mtx_; }; -class PADDLE_API GPUContext : public DeviceContext { +class PADDLE_API GPUContext : public DeviceContext, + public TypeInfoTraits { public: explicit GPUContext(const GPUPlace& place, bool init = true); @@ -166,6 +168,13 @@ class PADDLE_API GPUContext : public DeviceContext { void WaitStreamCallback() const; + // Several methods for adapting Dnn-specific attributes + bool HasDnnAttr(const std::string& attr_name) const; + const Attribute& GetDnnAttr(const std::string& attr_name) const; + void SetDnnAttr(const std::string& attr_name, Attribute attr); + + static const char* name() { return "GPUContext"; } + public: /*! \brief Return nccl communicators. */ ncclComm_t nccl_comm() const; @@ -250,10 +259,10 @@ class PADDLE_API GPUContext : public DeviceContext { std::unique_ptr impl_; }; -// Note: In order to register the kernel of CUDNN, GPUDNNContext is required. +// Note: In order to register the kernel of CUDNN, DnnContext is required. // Currently, CUDNN kernel directly uses GPUContext. 
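The GPUContext side deliberately mirrors the OneDNNContext API, so framework code can hand gpudnn-only attributes to a kernel without widening the kernel signature. A minimal round-trip sketch, with attribute names taken from ops_extra_info.h:

// Sketch: producer (framework) and consumer (gpudnn kernel) sides.
gpu_ctx->SetDnnAttr("exhaustive_search", true);  // while building the context

bool exhaustive =
    gpu_ctx->HasDnnAttr("exhaustive_search")
        ? PADDLE_GET_CONST(bool, gpu_ctx->GetDnnAttr("exhaustive_search"))
        : false;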
But if the kernel function
// has the same name, this will lead to duplicate instantiations of GPU kernel
-// and GPUDNN kernel function, so if we using GPUDNNContext = GPUContext, we
+// and Dnn kernel function, so if we used DnnContext = GPUContext, we
 // must use different function name for cudnn kernel
 using GPUDNNContext = GPUContext;
diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc
index 950483a469ed8..6c69191c94457 100644
--- a/paddle/phi/backends/onednn/onednn_context.cc
+++ b/paddle/phi/backends/onednn/onednn_context.cc
@@ -16,9 +16,10 @@
 
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/enforce.h"
+#include "paddle/utils/flat_hash_map.h"
 
-#include "paddle/fluid/framework/expect.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/expect.h"
 
 namespace phi {
 
@@ -284,6 +285,69 @@ struct OneDNNContext::Impl {
     return key_it->second;
   }
 
+  bool HasDnnAttr(const std::string& attr_name) const {
+    return dnn_attrs_.count(attr_name) != 0UL;
+  }
+
+  const Attribute& GetDnnAttr(const std::string& attr_name) const {
+    auto iter = dnn_attrs_.find(attr_name);
+    PADDLE_ENFORCE_NE(
+        iter,
+        dnn_attrs_.end(),
+        phi::errors::NotFound("Attribute `%s` is not found in OneDNNContext.",
+                              attr_name));
+    return iter->second;
+  }
+
+  void SetDnnAttr(const std::string& attr_name, Attribute attr) {
+    dnn_attrs_[attr_name] = attr;
+  }
+
+  bool HasDnnInput(const std::string& input_name) const {
+    return dnn_inputs_.count(input_name) != 0UL;
+  }
+
+  const DenseTensor* GetDnnInput(const std::string& input_name) const {
+    auto iter = dnn_inputs_.find(input_name);
+    PADDLE_ENFORCE_NE(
+        iter,
+        dnn_inputs_.end(),
+        phi::errors::NotFound(
+            "Input DenseTensor `%s` is not found in OneDNNContext.",
+            input_name));
+    return iter->second;
+  }
+
+  void SetDnnInput(const std::string& input_name, const DenseTensor* input) {
+    dnn_inputs_[input_name] = input;
+  }
+
+  void SetInputsName(const TensorNameMap& inputs_name) {
+    inputs_name_ = inputs_name;
+  }
+
+  void SetOutputsName(const TensorNameMap& outputs_name) {
+    outputs_name_ = outputs_name;
+  }
+
+  const std::vector<std::string>& GetInputsName(
+      const std::string& input) const {
+    auto it = inputs_name_.find(input);
+    PADDLE_ENFORCE_NE(it,
+                      inputs_name_.end(),
+                      phi::errors::NotFound(
+                          "OneDnnContext does not have the input %s.", input));
+    return it->second;
+  }
+
+  const std::vector<std::string>& GetOutputsName(
+      const std::string& output) const {
+    auto it = outputs_name_.find(output);
+    PADDLE_ENFORCE_NE(
+        it,
+        outputs_name_.end(),
+        phi::errors::NotFound("OneDnnContext does not have the output %s.",
+                              output));
+    return it->second;
+  }
+
   std::shared_ptr<BlobMap> p_blobmap_;
   // Map key is the pointer of an executor and the value is the data (an
   // iterator in the map) needed to erase it
   std::shared_ptr<std::mutex> p_mutex_;
   // 0 - clearing is allowed. x > 0 do not clear.
   unsigned int block_next_cache_clearing_ = 0;
+
+  // Holds some attributes only used by the onednn kernel calculation.
+  // Since the original mkldnn op kernels directly add the operations that
+  // require fusion to the native kernel operations and control them with the
+  // `fuse_xxx` attributes, some attributes that look independent of the
+  // device also end up saved here for onednn.
+  // Ideally, fusion would be implemented separately as a fusion op and
+  // kernel, instead of being patched onto a basic operation.
+ // Because DeviceContext is a global singleton, you need to ensure thread + // safety, use the thread_local variable + static thread_local AttributeMap dnn_attrs_; + // For onednn, in addition to extra attrs, there are also extra inputs, + // but the number is small. Hope that the implementation can be optimized + // to remove this member in the future. + static thread_local paddle::flat_hash_map + dnn_inputs_; + + // Onednn need get input and output's name in current Kernel for generating + // unique_key. + static thread_local TensorNameMap inputs_name_; + static thread_local TensorNameMap outputs_name_; }; +thread_local AttributeMap OneDNNContext::Impl::dnn_attrs_ = {}; +thread_local paddle::flat_hash_map + OneDNNContext::Impl::dnn_inputs_ = {}; +thread_local TensorNameMap OneDNNContext::Impl::inputs_name_ = {}; +thread_local TensorNameMap OneDNNContext::Impl::outputs_name_ = {}; + OneDNNContext::OneDNNContext(const Place& place) : CPUContext(place), impl_(std::make_unique()) {} @@ -322,5 +413,49 @@ OneDNNContext::BlobPtr_t OneDNNContext::GetBlob( return impl_->GetBlob(name); } +bool OneDNNContext::HasDnnAttr(const std::string& attr_name) const { + return impl_->HasDnnAttr(attr_name); +} + +const Attribute& OneDNNContext::GetDnnAttr(const std::string& attr_name) const { + return impl_->GetDnnAttr(attr_name); +} + +void OneDNNContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { + return impl_->SetDnnAttr(attr_name, std::move(attr)); +} + +bool OneDNNContext::HasDnnInput(const std::string& input_name) const { + return impl_->HasDnnInput(input_name); +} + +const DenseTensor* OneDNNContext::GetDnnInput( + const std::string& input_name) const { + return impl_->GetDnnInput(input_name); +} + +void OneDNNContext::SetDnnInput(const std::string& input_name, + const DenseTensor* input) { + return impl_->SetDnnInput(input_name, input); +} + +void OneDNNContext::SetInputsName(const TensorNameMap& inputs_name) { + impl_->SetInputsName(inputs_name); +} + +void OneDNNContext::SetOutputsName(const TensorNameMap& outputs_name) { + impl_->SetOutputsName(outputs_name); +} + +const std::vector& OneDNNContext::GetInputsName( + const std::string& input) const { + return impl_->GetInputsName(input); +} + +const std::vector& OneDNNContext::GetOutputsName( + const std::string& output) const { + return impl_->GetOutputsName(output); +} + } // namespace phi #endif diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h index d7cf8a0ff4902..9035aef5f9a32 100644 --- a/paddle/phi/backends/onednn/onednn_context.h +++ b/paddle/phi/backends/onednn/onednn_context.h @@ -20,9 +20,12 @@ limitations under the License. 
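The thread_local name maps registered above exist so that CreateKey-style cache helpers can still derive a per-op unique key now that phi kernels no longer see the ExecutionContext. A minimal sketch (the "Out" slot name is illustrative):

// Sketch: a phi oneDNN kernel recovering an output name for its cache key.
const auto& out_names = dev_ctx.GetOutputsName("Out");
// out_names[0] can then be folded into the oneDNN blob cache key;
// GetOutputsName throws NotFound if the slot was never registered.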
*/ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/attribute.h" namespace phi { +using TensorNameMap = std::map>; + class OneDNNContextThreadLocals { // default mkldnn session id @@ -134,6 +137,26 @@ class OneDNNContext : public CPUContext { return OneDNNContextThreadLocals::fetch(); } + // Several methods for adapting ONEDNN-specific attributes and inputs + bool HasDnnAttr(const std::string& attr_name) const; + const Attribute& GetDnnAttr(const std::string& attr_name) const; + void SetDnnAttr(const std::string& attr_name, Attribute attr); + + bool HasDnnInput(const std::string& input_name) const; + const DenseTensor* GetDnnInput(const std::string& input_name) const; + void SetDnnInput(const std::string& input_name, const DenseTensor* input); + + void SetInputsName(const TensorNameMap& inputs_name); + + void SetOutputsName(const TensorNameMap& outputs_name); + + const std::vector& GetInputsName(const std::string& input) const; + + const std::vector& GetOutputsName( + const std::string& output) const; + + static const char* name() { return "OneDNNContext"; } + private: struct Impl; std::unique_ptr impl_; diff --git a/paddle/phi/backends/onednn/onednn_helper.h b/paddle/phi/backends/onednn/onednn_helper.h index e91e02282ccc0..11cc9c29f501f 100644 --- a/paddle/phi/backends/onednn/onednn_helper.h +++ b/paddle/phi/backends/onednn/onednn_helper.h @@ -195,6 +195,41 @@ inline std::string CreateKey(const OneDNNContext& dev_ctx, ArgTypes&&... args) { return key; } +inline std::vector> ToOnednnPadding( + const std::vector& paddings) { + if (paddings.size() == 6) { + int padding_front = paddings[0]; + int padding_back = paddings[1]; + int padding_top = paddings[2]; + int padding_bottom = paddings[3]; + int padding_left = paddings[4]; + int padding_right = paddings[5]; + + return {{padding_front, padding_top, padding_left}, + {padding_back, padding_bottom, padding_right}}; + } else { + int padding_top = paddings[0]; + int padding_bottom = paddings[1]; + int padding_left = paddings[2]; + int padding_right = paddings[3]; + + return {{padding_top, padding_left}, {padding_bottom, padding_right}}; + } +} + +// The function adjusts the vector of weight dimensions for group convolutions +inline void GetGroupConvWeightsTz(std::vector& weights_tz, // NOLINT + const int groups) { + if (groups > 1) { + // if (is_conv3d) [o, i, d, h, w]->[g, o/g, i, d, h, w] + // else [o, i, h, w] -> [g, o/g, i, h, w] + weights_tz.push_back(0); + std::rotate(weights_tz.begin(), weights_tz.end() - 1, weights_tz.end()); + weights_tz[0] = groups; + weights_tz[1] = weights_tz[1] / groups; + } +} + inline void MatchShapeToLayout(DenseTensor* tensor_in, DataLayout from, DataLayout to) { diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index d1810090cd539..4a28c4262f32d 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -39,6 +39,67 @@ using memory = dnnl::memory; using OneDNNMemoryFormat = dnnl::memory::format_tag; +static void AppendActivation(const OneDNNContext& dev_ctx, + dnnl::post_ops& post_ops, // NOLINT + float activation_scale = 1.0f) { + const auto invalid_attribute = + dev_ctx.HasDnnAttr("fuse_activation") + ? PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation")) + .empty() + : true; + if (invalid_attribute) return; + + const auto fuse_activation = + dev_ctx.HasDnnAttr("fuse_activation") + ? 
PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation")) + : ""; + const auto fuse_alpha = + dev_ctx.HasDnnAttr("fuse_alpha") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_alpha")) + : 0.0f; + const auto fuse_beta = + dev_ctx.HasDnnAttr("fuse_beta") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_beta")) + : 0.0f; + + if (fuse_activation == "hard_sigmoid") { + post_ops.append_eltwise(activation_scale, + dnnl::algorithm::eltwise_linear, + fuse_alpha, + fuse_beta); + post_ops.append_eltwise( + activation_scale, dnnl::algorithm::eltwise_clip, 0.0f, 1.0f); + } else { + const std::unordered_map activation_map = { + {"abs", dnnl::algorithm::eltwise_abs}, + {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_erf", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, + {"hard_swish", dnnl::algorithm::eltwise_hardswish}, + {"leaky_relu", dnnl::algorithm::eltwise_relu}, + {"mish", dnnl::algorithm::eltwise_mish}, + {"relu", dnnl::algorithm::eltwise_relu}, + {"relu6", dnnl::algorithm::eltwise_bounded_relu}, + {"sigmoid", dnnl::algorithm::eltwise_logistic}, + {"sqrt", dnnl::algorithm::eltwise_sqrt}, + {"swish", dnnl::algorithm::eltwise_swish}, + {"tanh", dnnl::algorithm::eltwise_tanh}}; + + const auto& activation_type = activation_map.find(fuse_activation); + + PADDLE_ENFORCE_NE( + activation_type, + activation_map.end(), + phi::errors::InvalidArgument( + "Activation '%s' not found in oneDNN algorithms mapper", + fuse_activation)); + + post_ops.append_eltwise( + activation_scale, activation_type->second, fuse_alpha, fuse_beta); + } +} + template (input_data)); } }; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index 90fc7c97b785c..61967a9a58d39 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_XPU + #include #include "paddle/phi/backends/xpu/forwards.h" @@ -26,7 +28,8 @@ namespace xpu = baidu::xpu::api; namespace phi { -class XPUContext : public DeviceContext { +class XPUContext : public DeviceContext, + public TypeInfoTraits { public: XPUContext(); @@ -65,6 +68,8 @@ class XPUContext : public DeviceContext { XPUStream stream() const; + static const char* name() { return "XPUContext"; } + private: struct Impl; std::unique_ptr impl_; @@ -79,3 +84,5 @@ using KPSContext = XPUContext; #endif } // namespace phi + +#endif diff --git a/paddle/phi/core/attribute.h b/paddle/phi/core/attribute.h index d8d684b9030e9..75a6b16f46e0b 100644 --- a/paddle/phi/core/attribute.h +++ b/paddle/phi/core/attribute.h @@ -48,6 +48,6 @@ using Attribute = paddle::variant; -using RuntimeAttrs = paddle::flat_hash_map; +using AttributeMap = paddle::flat_hash_map; } // namespace phi diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index c845d50f77564..5dad261f43b34 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -21,6 +21,7 @@ limitations under the License. 
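AppendActivation above reads the fuse_* attributes straight from the OneDNNContext, so a handler only needs to thread its post_ops through it. A minimal sketch of the intended call pattern:

// Sketch: building primitive attributes with a fused activation.
dnnl::primitive_attr attr;
dnnl::post_ops post_ops;
AppendActivation(dev_ctx, post_ops);  // appends an eltwise post-op only when
                                      // fuse_activation is set on the context
attr.set_post_ops(post_ops);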
*/ #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/generator.h" +#include "paddle/phi/core/utils/type_registry.h" namespace phi { class TensorBase; @@ -188,9 +189,21 @@ class PADDLE_API DeviceContext { */ Generator* GetHostGenerator() const; + /** + * @brief Return the type information of the derived class to support + * safely downcast in non-rtti environment. + * + * @return The type information of the derived class. + */ + TypeInfo type_info() const { return type_info_; } + private: struct Impl; std::unique_ptr impl_; + + template + friend class TypeInfoTraits; + TypeInfo type_info_{TypeInfo::kUnknownType}; }; } // namespace phi diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 251916d8c1a15..cfe8b47ef9afa 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -43,6 +43,7 @@ limitations under the License. */ #include "paddle/phi/core/errors.h" #include "paddle/utils/string/printf.h" #include "paddle/utils/string/to_string.h" +#include "paddle/utils/variant.h" DECLARE_int32(call_stack_level); @@ -409,80 +410,75 @@ struct EnforceNotMet : public std::exception { /** EXTENDED TOOL FUNCTIONS WITH CHECKING **/ /* - * Summary: This macro is used to get Variable or internal type - * data (such as LoDTensor or SelectedRows) of the Input and - * Output in op, generally used when call scope.FindVar(Input/ - * Output("Name")) or ctx.Input(). - * Firstly this macro check whether the obtained pointer is null, - * and then return data if it is not null. - * - * Note: This macro is only suitable for specific scenarios and - * does not intended to be widely used. If it cannot meet the - * requirements, please use other PADDLE_ENFORCE** check macro. + * Summary: This PADDLE_GET(_**) series macros are used to call paddle::get + * safely. paddle::get is not a completely safe api, although it will not + * go wrong in most cases, but in extreme cases, it may fail and directly + * throw a paddle::bad_variant_access const exception, without any stack + *information. + * This kind of problems is difficult to debug, so add these macros to + * enrich paddle::get error information. At the same time, we restrict + * the direct use of paddle::get by CI rule. * * Parameters: - *     __PTR: pointer - * __ROLE: (string), Input or Output - * __NAME: (string), Input or Output name - * __OP_TYPE: (string), the op type - * - * Return: The data pointed to by the pointer. + *     __TYPE: the target variable type + * __VALUE: the target variable to get * * Examples: - * GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", "Mul"); - */ -#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \ - (([&]() -> std::add_lvalue_reference::type { \ - auto* __ptr = (__PTR); \ - if (UNLIKELY(nullptr == __ptr)) { \ - auto __summary__ = phi::errors::NotFound( \ - "Unable to get %s data of %s %s in operator %s. " \ - "Possible reasons are:\n" \ - " 1. The %s is not the %s of operator %s;\n" \ - " 2. The %s has no corresponding variable passed in;\n" \ - " 3. 
The %s corresponding variable is not initialized.", \ - phi::demangle( \ - typeid(std::add_lvalue_reference::type) \ - .name()), \ - __ROLE, \ - __NAME, \ - __OP_TYPE, \ - __NAME, \ - __ROLE, \ - __OP_TYPE, \ - __NAME, \ - __NAME); \ - auto __message__ = ::paddle::string::Sprintf( \ - "%s\n [Hint: pointer " #__PTR " should not be null.]", \ - __summary__.error_message()); \ - __THROW_ERROR_INTERNAL__( \ - phi::ErrorSummary(__summary__.code(), __message__)); \ - } \ - return *__ptr; \ - })()) - -/* - * Summary: This macro is used to check whether op has specified - * Input or Output Variables. Because op's Input and Output - * checking are written similarly, so abstract this macro. - * - * Parameters: - *     __EXPR: (bool), the bool expression - * __ROLE: (string), Input or Output - * __NAME: (string), Input or Output name - * __OP_TYPE: (string), the op type + * - unsafe writing: int x = paddle::get(y); + * - safe writing: int x = PADDLE_GET(int, y); * - * Examples: - * OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul"); + * Note: GCC 4.8 cannot select right overloaded function here, so need + * to define different functions and macros here, after we upgrade + * CI gcc version, we can only define one PADDLE_GET macro. */ -#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \ - do { \ - PADDLE_ENFORCE_EQ( \ - __EXPR, \ - true, \ - phi::errors::NotFound( \ - "No %s(%s) found for %s operator.", __ROLE, __NAME, __OP_TYPE)); \ - } while (0) +namespace details { + +#define DEFINE_SAFE_PADDLE_GET( \ + __InputType, __OutputType, __OutputTypePtr, __FuncName) \ + template \ + auto __FuncName( \ + __InputType input, const char* expression, const char* file, int line) \ + ->typename std::conditional::value, \ + __OutputTypePtr, \ + __OutputType>::type { \ + try { \ + return paddle::get(input); \ + } catch (paddle::bad_variant_access const&) { \ + HANDLE_THE_ERROR \ + throw ::phi::enforce::EnforceNotMet( \ + phi::errors::InvalidArgument( \ + "paddle::get failed, cannot get value " \ + "(%s) by type %s, its type is %s.", \ + expression, \ + phi::enforce::demangle(typeid(OutputType).name()), \ + phi::enforce::demangle(input.type().name())), \ + file, \ + line); \ + END_HANDLE_THE_ERROR \ + } \ + } + +DEFINE_SAFE_PADDLE_GET(InputType&, OutputType&, OutputType*, SafeBoostGet); +DEFINE_SAFE_PADDLE_GET(const InputType&, + const OutputType&, + const OutputType*, + SafeBoostGetConst); +DEFINE_SAFE_PADDLE_GET(InputType&&, + OutputType, + OutputType*, + SafeBoostGetMutable); + +} // namespace details + +#define PADDLE_GET(__TYPE, __VALUE) \ + phi::enforce::details::SafeBoostGet<__TYPE>( \ + __VALUE, #__VALUE, __FILE__, __LINE__) +#define PADDLE_GET_CONST(__TYPE, __VALUE) \ + phi::enforce::details::SafeBoostGetConst<__TYPE>( \ + __VALUE, #__VALUE, __FILE__, __LINE__) +#define PADDLE_GET_MUTABLE(__TYPE, __VALUE) \ + phi::enforce::details::SafeBoostGetMutable<__TYPE>( \ + __VALUE, #__VALUE, __FILE__, __LINE__) } // namespace enforce using namespace enforce; // NOLINT diff --git a/paddle/fluid/framework/expect.h b/paddle/phi/core/expect.h similarity index 100% rename from paddle/fluid/framework/expect.h rename to paddle/phi/core/expect.h diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 107a1fe49c98f..58afc8c5fd251 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -138,8 +138,6 @@ class KernelContext { template const AttrType& AttrAt(size_t idx) const; - const RuntimeAttrs& GetRuntimeAttrs() const { return runtime_attrs_; } - 
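With the PADDLE_GET family now living in phi::enforce (and re-exported through the fluid header via its using-directives), call sites stay unchanged. A minimal usage sketch:

// Sketch: safe variant access with enriched error information.
phi::Attribute attr = 3;              // currently holds an int
int x = PADDLE_GET_CONST(int, attr);  // ok
// PADDLE_GET_CONST(float, attr) would throw EnforceNotMet carrying the
// failing expression, file, and line instead of a bare bad_variant_access.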
size_t InputsSize() const { return inputs_.size(); } size_t OutputsSize() const { return outputs_.size(); } size_t AttrsSize() const { return attrs_.size(); } @@ -161,8 +159,6 @@ class KernelContext { paddle::small_vector, kInputSmallVectorSize> input_range_; paddle::small_vector, kOutputSmallVectorSize> output_range_; - - RuntimeAttrs runtime_attrs_; }; } // namespace phi diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 7ae01b7c725f0..396b17dd401d5 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -233,8 +233,6 @@ struct KernelArgsParseFunctor { args_def->AppendAttribute(AttributeType::DATA_LAYOUT); } else if (arg_type == std::type_index(typeid(Place))) { args_def->AppendAttribute(AttributeType::PLACE); - } else if (arg_type == std::type_index(typeid(RuntimeAttrs))) { - // do nothing } else { PADDLE_THROW(phi::errors::Unavailable( "Unsupported kernel argument type `%s`.", arg_type.name())); diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index cdfdefa059cd7..55ea3a31eb318 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -14,13 +14,7 @@ #pragma once -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/custom/custom_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/onednn/onednn_context.h" -#ifdef PADDLE_WITH_XPU -#include "paddle/phi/backends/xpu/xpu_context.h" -#endif +#include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" @@ -330,21 +324,6 @@ struct KernelImpl { PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(TensorArray); - template - struct KernelCallHelper { - template - static void Compute(KernelContext* ctx, PreviousArgs&... 
pargs) { - const auto& runtime_attrs = ctx->GetRuntimeAttrs(); - KernelCallHelper:: - template Compute( - ctx, pargs..., runtime_attrs); - } - }; - /* End case */ template struct KernelCallHelper> { diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 8550287c70704..48e72f503bf3c 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -409,12 +409,9 @@ void ConvInferMeta(const MetaTensor& input, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, MetaTensor* out, MetaConfig config) { std::vector paddings = paddings_t; @@ -559,27 +556,27 @@ void ConvInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } -void ConvInferInferMeta(const MetaTensor& input, - const MetaTensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - MetaTensor* out, - MetaConfig config) { +void Conv3DInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config) { ConvInferMeta(input, filter, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, + groups, data_format, - /*use_addto=*/false, - /*workspace_size_MB=*/512, // useless in infermeta - /*exhaustive_search=*/false, out, config); } @@ -922,6 +919,31 @@ void CrossEntropyWithSoftmaxInferMeta(const MetaTensor& logits, loss->share_lod(logits); } +void DepthwiseConvInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config) { + ConvInferMeta(input, + filter, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + out, + config); +} + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -2876,4 +2898,3 @@ void Unpool3dInferMeta(const MetaTensor& x, } // namespace phi PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); -PD_REGISTER_INFER_META_FN(conv2d_infer, phi::ConvInferInferMeta); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index d310f6282b09d..30e22cb3f56a6 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -80,26 +80,26 @@ void ConvInferMeta(const MetaTensor& input, const MetaTensor& filter, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, + const std::string& padding_algorithm, const std::vector& dilations, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, MetaTensor* out, MetaConfig config = MetaConfig()); -void ConvInferInferMeta(const MetaTensor& input, - const MetaTensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - 
const std::vector& dilations, - const std::string& data_format, - MetaTensor* out, - MetaConfig config = MetaConfig()); +void Conv3DInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config = MetaConfig()); void ConvTransposeInferMeta(const MetaTensor& x, const MetaTensor& filter, @@ -143,6 +143,20 @@ void CrossEntropyWithSoftmaxInferMeta(const MetaTensor& logits, MetaTensor* loss, MetaConfig config = MetaConfig()); +void DepthwiseConvInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, diff --git a/paddle/phi/kernels/conv_grad_grad_kernel.h b/paddle/phi/kernels/conv_grad_grad_kernel.h deleted file mode 100644 index f25cbe384c213..0000000000000 --- a/paddle/phi/kernels/conv_grad_grad_kernel.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
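Conv3DInferMeta and DepthwiseConvInferMeta, declared above, keep the legacy tuning attributes (use_addto, workspace_size_MB, exhaustive_search) in their signatures but, as the binary.cc hunk shows, drop them when delegating to the shared ConvInferMeta: those attributes steer kernel execution and never affect output shapes. A hedged sketch of that wrapper pattern, with illustrative names rather than Paddle's real types:

#include <vector>

struct MetaT { std::vector<long> dims; };  // stand-in for MetaTensor

// Shared shape inference used by conv2d, conv3d and depthwise_conv2d.
void SharedConvInferMeta(const MetaT& x, const MetaT& w, MetaT* out) {
  // ... derive out->dims from x.dims, w.dims, strides, paddings ...
}

// Legacy-signature wrapper: accepts the execution-only knobs so existing
// operator definitions keep compiling, then ignores them.
void LegacyConvInferMeta(const MetaT& x, const MetaT& w, bool use_addto,
                         int workspace_size_MB, bool exhaustive_search,
                         MetaT* out) {
  (void)use_addto;          // irrelevant to shape inference
  (void)workspace_size_MB;  // ditto
  (void)exhaustive_search;  // ditto
  SharedConvInferMeta(x, w, out);
}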
- -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void ConvGradGradKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad); - -template -void Conv3DGradGradKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad); - -} // namespace phi diff --git a/paddle/phi/kernels/conv_grad_kernel.h b/paddle/phi/kernels/conv_grad_kernel.h index a6b970e0996be..4164db5f8a01b 100644 --- a/paddle/phi/kernels/conv_grad_kernel.h +++ b/paddle/phi/kernels/conv_grad_kernel.h @@ -25,13 +25,10 @@ void ConvGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, + const std::string& padding_algorithm, const std::vector& dilations, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* input_grad, DenseTensor* filter_grad); @@ -42,7 +39,7 @@ void Conv3DGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -59,7 +56,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -70,4 +67,41 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, DenseTensor* input_grad, DenseTensor* filter_grad); +template +void ConvGradGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad); + +template +void Conv3DGradGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const 
std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad); + } // namespace phi diff --git a/paddle/phi/kernels/conv_kernel.cc b/paddle/phi/kernels/conv_kernel.cc deleted file mode 100644 index 542a4ec8a61c8..0000000000000 --- a/paddle/phi/kernels/conv_kernel.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/conv_kernel.h" - -#include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -template -void ConvInferKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - DenseTensor* out) { - ConvKernel(dev_ctx, - input, - filter, - strides, - paddings, - paddding_algorithm, - groups, - dilations, - data_format, - /*use_addto=*/false, - /*workspace_size_MB=*/ - paddle::platform::GetDefaultConvWorkspaceSizeLimitMB(), - /*exhaustive_search=*/false, - out); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - conv2d_infer, CPU, ALL_LAYOUT, phi::ConvInferKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL( - conv2d_infer, GPU, ALL_LAYOUT, phi::ConvInferKernel, float, double) {} -#endif diff --git a/paddle/phi/kernels/conv_kernel.h b/paddle/phi/kernels/conv_kernel.h index a105fe794f94d..06faee6d3aa1e 100644 --- a/paddle/phi/kernels/conv_kernel.h +++ b/paddle/phi/kernels/conv_kernel.h @@ -25,12 +25,9 @@ void ConvKernel(const Context& dev_ctx, const std::vector& strides, const std::vector& paddings, const std::string& padding_algorithm, - int groups, const std::vector& dilations, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* out); template @@ -54,7 +51,7 @@ void DepthwiseConvKernel(const Context& dev_ctx, const DenseTensor& filter, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -64,16 +61,4 @@ void DepthwiseConvKernel(const Context& dev_ctx, bool fuse_relu, DenseTensor* out); -template -void ConvInferKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - DenseTensor* out); - } // namespace phi diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc deleted file mode 100644 index 3289c8f5c84d6..0000000000000 --- 
a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/conv_grad_grad_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h" - -namespace phi { -template -void Conv3DGradGradKernel(const Context& ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - ConvGradGradKernel(ctx, - input, - filter, - out_grad, - input_grad_grad, - filter_grad_grad, - strides, - paddings_t, - padding_algorithm, - groups, - dilations_t, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search_t, - input_grad, - filter_grad, - out_grad_grad); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { -} - -PD_REGISTER_KERNEL(conv3d_grad_grad, - CPU, - ALL_LAYOUT, - phi::Conv3DGradGradKernel, - float, - double) {} diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc index 880837dd7cd61..06a63267c5c96 100644 --- a/paddle/phi/kernels/cpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc @@ -27,7 +27,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -43,13 +43,10 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, out_grad, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, input_grad, filter_grad); } @@ -61,7 +58,7 @@ void Conv3DGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -76,17 +73,50 @@ void Conv3DGradKernel(const Context& dev_ctx, out_grad, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, input_grad, filter_grad); } +template +void Conv3DGradGradKernel(const Context& ctx, + const DenseTensor& input, + const 
DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { + ConvGradGradKernel(ctx, + input, + filter, + out_grad, + input_grad_grad, + filter_grad_grad, + strides, + paddings_t, + padding_algorithm, + dilations_t, + groups, + data_format, + input_grad, + filter_grad, + out_grad_grad); +} + } // namespace phi PD_REGISTER_KERNEL( @@ -101,3 +131,14 @@ PD_REGISTER_KERNEL(depthwise_conv2d_grad, PD_REGISTER_KERNEL( conv3d_grad, CPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {} + +PD_REGISTER_KERNEL( + conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { +} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + CPU, + ALL_LAYOUT, + phi::Conv3DGradGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/conv_kernel.cc b/paddle/phi/kernels/cpu/conv_kernel.cc index ec3253194930b..12bfa852d96c9 100644 --- a/paddle/phi/kernels/cpu/conv_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_kernel.cc @@ -19,6 +19,30 @@ #include "paddle/phi/kernels/impl/conv_kernel_impl.h" namespace phi { + +template +void ConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + DenseTensor* out) { + ConvKernelImpl(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); +} + template void DepthwiseConvKernel(const Context& dev_ctx, const DenseTensor& input, @@ -34,19 +58,16 @@ void DepthwiseConvKernel(const Context& dev_ctx, bool exhaustive_search, bool fuse_relu, DenseTensor* out) { - ConvKernel(dev_ctx, - input, - filter, - strides, - paddings, - padding_algorithm, - groups, - dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search, - out); + ConvKernelImpl(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); } template @@ -63,19 +84,16 @@ void Conv3DKernel(const Context& dev_ctx, int workspace_size_MB, bool exhaustive_search, DenseTensor* out) { - ConvKernel(dev_ctx, - input, - filter, - strides, - paddings, - padding_algorithm, - groups, - dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search, - out); + ConvKernelImpl(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); } } // namespace phi diff --git a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc index 2d363189936b0..8ebb70653e072 100644 --- a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc @@ -12,6 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + #include "paddle/phi/kernels/erfinv_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" diff --git a/paddle/phi/kernels/cpu/erfinv_kernel.cc b/paddle/phi/kernels/cpu/erfinv_kernel.cc index f298cc358d662..8822d6e536cb6 100644 --- a/paddle/phi/kernels/cpu/erfinv_kernel.cc +++ b/paddle/phi/kernels/cpu/erfinv_kernel.cc @@ -12,10 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + #include "paddle/phi/kernels/erfinv_kernel.h" -#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + ctx.template Alloc(out); + auto eigen_in = EigenVector::Flatten(x); + auto eigen_out = EigenVector::Flatten(*out); + auto& place = *ctx.eigen_device(); + constexpr T half = static_cast(0.5); + constexpr T half_sqrt = static_cast(M_SQRT1_2); + eigen_out.device(place) = (eigen_in * half + half).ndtri() * half_sqrt; +} + +} // namespace phi PD_REGISTER_KERNEL(erfinv, CPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu deleted file mode 100644 index 9d3e6da944a19..0000000000000 --- a/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
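The rewritten CPU ErfinvKernel above computes (x * 0.5 + 0.5).ndtri() * M_SQRT1_2. This uses the identity erf(x) = 2 * Phi(x * sqrt(2)) - 1, where Phi is the standard normal CDF and Eigen's ndtri() is Phi^{-1}; inverting gives erfinv(y) = Phi^{-1}((y + 1) / 2) / sqrt(2). A small numeric cross-check that assumes only <cmath>, inverting std::erf by Newton iteration since the standard library provides no ndtri:

#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif
#include <cmath>
#include <cstdio>

// Newton's method on erf: d/dx erf(x) = (2/sqrt(pi)) * exp(-x*x).
// Converges quickly for |y| bounded away from 1; a sketch, not a kernel.
double ErfinvNewton(double y) {
  double x = 0.0;
  for (int i = 0; i < 60; ++i) {
    x -= (std::erf(x) - y) / (M_2_SQRTPI * std::exp(-x * x));
  }
  return x;
}

int main() {
  // Round trip: erfinv(erf(0.5)) recovers 0.5, the same value the kernel's
  // ndtri((y + 1) / 2) * M_SQRT1_2 formulation would produce.
  std::printf("%.12f\n", ErfinvNewton(std::erf(0.5)));
  return 0;
}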
- -#include "paddle/phi/kernels/conv_grad_grad_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h" - -PD_REGISTER_KERNEL( - conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { -} diff --git a/paddle/phi/kernels/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_kernel.cu index e4b094314f8c1..4f15030365a6c 100644 --- a/paddle/phi/kernels/gpu/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_grad_kernel.cu @@ -27,7 +27,7 @@ void Conv3DGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -42,13 +42,10 @@ void Conv3DGradKernel(const Context& dev_ctx, out_grad, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, input_grad, filter_grad); } @@ -60,3 +57,7 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( conv3d_grad, GPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {} + +PD_REGISTER_KERNEL( + conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/conv_kernel.cu b/paddle/phi/kernels/gpu/conv_kernel.cu index e548b6fbf6d7b..a089175c96fb6 100644 --- a/paddle/phi/kernels/gpu/conv_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_kernel.cu @@ -20,6 +20,29 @@ namespace phi { +template +void ConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + DenseTensor* out) { + ConvKernelImpl(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); +} + template void Conv3DKernel(const Context& dev_ctx, const DenseTensor& input, @@ -34,19 +57,16 @@ void Conv3DKernel(const Context& dev_ctx, int workspace_size_MB, bool exhaustive_search, DenseTensor* out) { - ConvKernel(dev_ctx, - input, - filter, - strides, - paddings, - padding_algorithm, - groups, - dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search, - out); + ConvKernelImpl(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu b/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu index ce45d4e018e5a..034788502b043 100644 --- a/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu @@ -12,6 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + #include "paddle/phi/kernels/erfinv_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu deleted file mode 100644 index de1360a5966f3..0000000000000 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ /dev/null @@ -1,824 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/conv_grad_grad_kernel.h" - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/kernel_registry.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" -#else -#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" -#endif - -#include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/kernels/cpu/conv_util.h" -#include "paddle/phi/kernels/funcs/batch_norm_utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/padding.h" -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" - -namespace phi { - -template -void ConvCudnnGradGradKernel( - const Context& ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - auto X = &input; - auto W = &filter; - auto dO = &out_grad; - auto ddX = input_grad_grad.get_ptr(); - auto ddW = filter_grad_grad.get_ptr(); - - auto ddO = out_grad_grad; - auto dW = filter_grad; - auto dX = input_grad; - if (ddO) { - ctx.template Alloc(ddO); - phi::funcs::SetConstant set_zero; - set_zero(ctx, ddO, static_cast(0)); - } - if (dW) { - ctx.template Alloc(dW); - } - if (dX) { - ctx.template Alloc(dX); - } - - // const T* x = X->data(); - const T* dy = dO->data(); - const T* w = W->data(); - - const T* ddx = nullptr; - const T* ddw = nullptr; - T *dw, *dx, *ddy; - dw = dx = ddy = nullptr; - T* transformed_dx = nullptr; - std::vector dilations = dilations_t; - - bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, - false, - phi::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - std::vector paddings = paddings_t; - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensors to channel first----------- - DenseTensor transformed_X_channel(X->type()); - DenseTensor transformed_dO_channel(dO->type()); - DenseTensor transformed_ddX_channel(X->type()); - - DenseTensor transformed_ddO_channel(dO->type()); - DenseTensor transformed_dX_channel(X->type()); - - if 
(channel_last) { - ResizeToChannelFirst(ctx, X, &transformed_X_channel); - TransToChannelFirst(ctx, X, &transformed_X_channel); - - ResizeToChannelFirst(ctx, dO, &transformed_dO_channel); - TransToChannelFirst(ctx, dO, &transformed_dO_channel); - - if (ddX) { - ResizeToChannelFirst(ctx, ddX, &transformed_ddX_channel); - TransToChannelFirst(ctx, ddX, &transformed_ddX_channel); - } - - if (ddO) { - ResizeToChannelFirst(ctx, ddO, &transformed_ddO_channel); - } - if (dX) { - ResizeToChannelFirst(ctx, dX, &transformed_dX_channel); - ctx.template Alloc(&transformed_dX_channel); - } - - } else { - transformed_X_channel = *X; - transformed_dO_channel = *dO; - if (ddX) { - transformed_ddX_channel = *ddX; - } - if (ddO) { - transformed_ddO_channel.ShareDataWith(*ddO); - } - if (dX) { - transformed_dX_channel.ShareDataWith(*dX); - } - } - - auto in_dims = transformed_X_channel.dims(); - auto filter_dims = W->dims(); - DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); - DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); - DenseTensor transformed_X(X->type()); - DenseTensor transformed_ddX(X->type()); - - DenseTensor transformed_dX(X->type()); - - std::vector padding_common(data_dim, 0); - std::vector input_pad(X->dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_X_channel.dims()[0]; - new_input_shape_vec[1] = transformed_X_channel.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_X_channel.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - DDim new_input_shape(make_ddim(new_input_shape_vec)); - transformed_X.Resize(new_input_shape); - transformed_ddX.Resize(new_input_shape); - transformed_dX.Resize(new_input_shape); - - ctx.template Alloc(&transformed_X); - - if (ddX) { - ctx.template Alloc(&transformed_ddX); - } - if (dX) { - ctx.template Alloc(&transformed_dX); - } - - // pad for input - const int rank = X->dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - funcs::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); - if (ddX) { - funcs::PadFunction(ctx, - input_pad, - transformed_ddX_channel, - pad_value, - &transformed_ddX); - } - } break; - case 5: { - funcs::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); - if (ddX) { - funcs::PadFunction(ctx, - input_pad, - transformed_ddX_channel, - pad_value, - &transformed_ddX); - } - } break; - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_X.ShareDataWith(transformed_X_channel); - if (ddX) { - transformed_ddX.ShareDataWith(transformed_ddX_channel); - } - if (dX) { - transformed_dX.ShareDataWith(transformed_dX_channel); - } - - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - 
for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* x = transformed_X.data(); - - int iwo_group = groups; - int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_group = 1; - c_group = groups; - groups = 1; -#endif - auto dtype = paddle::platform::CudnnDataType::type; - - auto handle = ctx.cudnn_handle(); - auto layout = paddle::platform::GetCudnnTensorFormat( - paddle::platform::DataLayout::kNCHW); - - ConvArgs args1{handle, - &transformed_ddX, - W, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - ConvArgs args2{handle, - &transformed_X, - ddW, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - ConvArgs args3{handle, - &transformed_ddX, - dW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - ConvArgs args4{handle, - &transformed_dX, - ddW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - -#ifdef PADDLE_WITH_HIP - SearchResult fwd_result1; - SearchResult fwd_result2; - SearchResult data_result; - SearchResult filter_result; -#else - SearchResult fwd_result1; - SearchResult fwd_result2; - SearchResult data_result; - SearchResult filter_result; -#endif - - // ddo = conv(ddI, W) + conv(I, ddW) - size_t workspace_size = 0; - - T* transformed_ddy_channel = nullptr; - if (ddO) { - ddy = ddO->data(); - transformed_ddy_channel = transformed_ddO_channel.data(); - if (ddX) { - args1.idesc.set(transformed_ddX, iwo_group); - args1.wdesc.set(*W, layout, iwo_group); - args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_group); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = search1::GetWorkspaceSize(args1); - fwd_result1.algo = search1::Find( - args1, exhaustive_search, false, workspace_size, ctx); -#else - using search1 = SearchAlgorithm; - fwd_result1 = search1::Find(ctx, args1, exhaustive_search, false); - workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); -#endif - } - - if (ddW) { - ddw = ddW->data(); - args2.idesc.set(transformed_X, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); - args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_group); - -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - fwd_result2.algo = search2::Find( - args2, exhaustive_search, false, workspace_size, ctx); -#else - using search2 = SearchAlgorithm; - fwd_result2 = search2::Find(ctx, args2, exhaustive_search, false); - workspace_size = std::max( - workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); -#endif - } - } - - if (dW && ddX) { - dw = dW->data(); - args3.idesc.set(transformed_ddX, iwo_group); - args3.wdesc.set(*dW, layout, iwo_group); - args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_group); - -#ifdef PADDLE_WITH_HIP - using search3 = SearchAlgorithm; - workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - 
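The four ConvArgs/algorithm-search blocks in this deleted double-grad kernel (the same logic is re-added verbatim into gpudnn/conv_grad_kernel.cu later in this patch) follow from the bilinearity of convolution, as the in-code comment "ddo = conv(ddI, W) + conv(I, ddW)" states. As a hedged summary in LaTeX, writing $*$ for convolution:

\begin{aligned}
\mathrm{ddO} &= \mathrm{ddX} * W + X * \mathrm{ddW} && \text{(args1, args2: two forward convolutions)}\\
\mathrm{dW}  &= \mathrm{bwd\_filter}(\mathrm{ddX},\, \mathrm{dO}) && \text{(args3: backward-filter pass)}\\
\mathrm{dX}  &= \mathrm{bwd\_data}(\mathrm{ddW},\, \mathrm{dO}) && \text{(args4: backward-data pass)}
\end{aligned}

Hence at most four cuDNN/MIOpen algorithm searches are needed and workspace_size is the max over the four, while beta stays 0.0f throughout because, per the note further down, in-place addto is not supported in double grad.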
filter_result.algo = search3::Find( - args3, exhaustive_search, deterministic, workspace_size, ctx); -#else - using search3 = SearchAlgorithm; - filter_result = - search3::Find(ctx, args3, exhaustive_search, deterministic); - workspace_size = std::max( - workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); -#endif - } - - if (ddW && dX) { - transformed_dx = transformed_dX.data(); - - args4.idesc.set(transformed_dX, iwo_group); - args4.wdesc.set(*ddW, layout, iwo_group); - args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_group); - -#ifdef PADDLE_WITH_HIP - using search4 = SearchAlgorithm; - workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_result.algo = search4::Find( - args4, exhaustive_search, deterministic, workspace_size, ctx); -#else - using search4 = SearchAlgorithm; - data_result = - search4::Find(ctx, args4, exhaustive_search, deterministic); - workspace_size = std::max( - workspace_size, search4::GetWorkspaceSize(args4, data_result.algo)); -#endif - } - - int i_n, i_c, i_d, i_h, i_w; - GetNCDHW( - transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); - - int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO_channel.dims(), - DataLayout::kNCHW, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = W->numel() / groups; - - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - - // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. - // ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : - // 0.0f; - // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); - auto workspace_handle = ctx.cudnn_workspace_handle(); - - if (ddO) { - if (ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionForward( - handle, - &alpha, - args1.idesc.desc(), - ddx, - args1.wdesc.desc(), - w, - args1.cdesc.desc(), - fwd_result1.algo, - &beta, - args1.odesc.desc(), - transformed_ddy_channel, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionForward( - handle, - &alpha, - args1.idesc.desc(), - ddx + i * group_offset_in, - args1.wdesc.desc(), - w + i * group_offset_filter, - args1.cdesc.desc(), - fwd_result1.algo, - workspace_ptr, - workspace_size, - &beta, - args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (ddW) { -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionForward( - handle, - &alpha, - args2.idesc.desc(), - x, - args2.wdesc.desc(), - ddw, - args2.cdesc.desc(), - fwd_result2.algo, - &beta, - args2.odesc.desc(), - transformed_ddy_channel, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionForward( - handle, - &alpha, - args2.idesc.desc(), 
- x + i * group_offset_in, - args2.wdesc.desc(), - ddw + i * group_offset_filter, - args2.cdesc.desc(), - fwd_result2.algo, - workspace_ptr, - workspace_size, - &alpha, - args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_ddO_channel, ddO); - } - } - T* transformed_dy_channel = transformed_dO_channel.data(); - if (dW && ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionBackwardWeights( - handle, - &alpha, - args3.odesc.desc(), - transformed_dy_channel, - args3.idesc.desc(), - ddx, - args3.cdesc.desc(), - filter_result.algo, - &beta, - args3.wdesc.desc(), - dw, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardFilter( - handle, - &alpha, - args3.idesc.desc(), - ddx + i * group_offset_in, - args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), - filter_result.algo, - workspace_ptr, - workspace_size, - &beta, - args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - workspace_size); - } -#endif - } - - if (dX && ddW) { - ddw = ddW->data(); -#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionBackwardData( - handle, - &alpha, - args4.odesc.desc(), - transformed_dy_channel, - args4.wdesc.desc(), - ddw, - args4.cdesc.desc(), - data_result.algo, - &beta, - args4.idesc.desc(), - transformed_dx, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args4.wdesc.desc(), - ddw + i * group_offset_filter, - args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.cdesc.desc(), - data_result.algo, - workspace_ptr, - workspace_size, - &beta, - args4.idesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); - } -#endif - - if (!is_sys_pad) { - // reverse padded input - std::vector starts(X->dims().size(), 0); - std::vector axes(X->dims().size(), 0); - - for (size_t i = 0; i < X->dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - if (X->dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_dX_channel, dX); - } - } -} - -template -void DepthwiseConvDoubleGradGPUDNNKernel( - const Context& ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - bool fuse_relu, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* 
out_grad_grad) { - ConvCudnnGradGradKernel(ctx, - input, - filter, - out_grad, - input_grad_grad, - filter_grad_grad, - strides, - paddings_t, - padding_algorithm, - groups, - dilations_t, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search_t, - input_grad, - filter_grad, - out_grad_grad); -} - -template -void Conv3DCudnnGradGradKernel( - const Context& ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - ConvCudnnGradGradKernel(ctx, - input, - filter, - out_grad, - input_grad_grad, - filter_grad_grad, - strides, - paddings_t, - padding_algorithm, - groups, - dilations_t, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search_t, - input_grad, - filter_grad, - out_grad_grad); -} - -} // namespace phi - -#ifdef PADDLE_WITH_HIP -PD_REGISTER_KERNEL(conv2d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::ConvCudnnGradGradKernel, - float, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(conv3d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::Conv3DCudnnGradGradKernel, - float, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, - GPU, - ALL_LAYOUT, - phi::DepthwiseConvDoubleGradGPUDNNKernel, - float, - phi::dtype::float16) {} -#else -#if CUDNN_VERSION_MIN(8, 1, 0) -PD_REGISTER_KERNEL(conv2d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::ConvCudnnGradGradKernel, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16) {} - -PD_REGISTER_KERNEL(conv3d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::Conv3DCudnnGradGradKernel, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16) {} - -PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, - GPU, - ALL_LAYOUT, - phi::DepthwiseConvDoubleGradGPUDNNKernel, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16) {} - -#else - -PD_REGISTER_KERNEL(conv2d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::ConvCudnnGradGradKernel, - float, - double, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(conv3d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::Conv3DCudnnGradGradKernel, - float, - double, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, - GPU, - ALL_LAYOUT, - phi::DepthwiseConvDoubleGradGPUDNNKernel, - float, - double, - phi::dtype::float16) {} - -#endif - -#endif diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 9a5bd1c5bce9e..dcd1e133c729d 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -44,12 +44,9 @@ void ConvCudnnGradKernel(const Context& ctx, const std::vector& strides_t, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, DenseTensor* input_grad, DenseTensor* filter_grad) { if (input_grad) { @@ -59,11 +56,25 @@ void ConvCudnnGradKernel(const Context& ctx, ctx.template Alloc(filter_grad); } + bool has_use_addto = ctx.HasDnnAttr("use_addto"); + VLOG(4) << "GPUContext contains `use_addto`: " << has_use_addto; + bool 
use_addto = has_use_addto + ? PADDLE_GET_CONST(bool, ctx.GetDnnAttr("use_addto")) + : false; + std::vector dilations = dilations_t; std::vector strides = strides_t; std::vector paddings = paddings_t; - bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool has_exhaustive_search = ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, ctx.GetDnnAttr("exhaustive_search")) + : false; + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; auto exhaustive_deterministic = exhaustive_search && deterministic; PADDLE_ENFORCE_EQ(exhaustive_deterministic, @@ -588,7 +599,7 @@ void Conv3DCudnnGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -603,13 +614,10 @@ void Conv3DCudnnGradKernel(const Context& dev_ctx, out_grad, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, input_grad, filter_grad); } @@ -621,7 +629,7 @@ void DepthwiseConvCudnnGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -637,17 +645,717 @@ void DepthwiseConvCudnnGradKernel(const Context& dev_ctx, out_grad, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, input_grad, filter_grad); } +template +void ConvCudnnGradGradKernel( + const Context& ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + const std::vector& dilations_t, + int groups, + const std::string& data_format, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { + auto X = &input; + auto W = &filter; + auto dO = &out_grad; + auto ddX = input_grad_grad.get_ptr(); + auto ddW = filter_grad_grad.get_ptr(); + + auto ddO = out_grad_grad; + auto dW = filter_grad; + auto dX = input_grad; + if (ddO) { + ctx.template Alloc(ddO); + phi::funcs::SetConstant set_zero; + set_zero(ctx, ddO, static_cast(0)); + } + if (dW) { + ctx.template Alloc(dW); + } + if (dX) { + ctx.template Alloc(dX); + } + + // const T* x = X->data(); + const T* dy = dO->data(); + const T* w = W->data(); + + const T* ddx = nullptr; + const T* ddw = nullptr; + T *dw, *dx, *ddy; + dw = dx = ddy = nullptr; + T* transformed_dx = nullptr; + std::vector dilations = dilations_t; + + bool has_exhaustive_search = ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? 
PADDLE_GET_CONST(bool, ctx.GetDnnAttr("exhaustive_search")) + : false; + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; + bool deterministic = FLAGS_cudnn_deterministic; + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, + false, + phi::errors::InvalidArgument( + "Cann't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time.")); + + std::vector paddings = paddings_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform Tensors to channel first----------- + DenseTensor transformed_X_channel(X->type()); + DenseTensor transformed_dO_channel(dO->type()); + DenseTensor transformed_ddX_channel(X->type()); + + DenseTensor transformed_ddO_channel(dO->type()); + DenseTensor transformed_dX_channel(X->type()); + + if (channel_last) { + ResizeToChannelFirst(ctx, X, &transformed_X_channel); + TransToChannelFirst(ctx, X, &transformed_X_channel); + + ResizeToChannelFirst(ctx, dO, &transformed_dO_channel); + TransToChannelFirst(ctx, dO, &transformed_dO_channel); + + if (ddX) { + ResizeToChannelFirst(ctx, ddX, &transformed_ddX_channel); + TransToChannelFirst(ctx, ddX, &transformed_ddX_channel); + } + + if (ddO) { + ResizeToChannelFirst(ctx, ddO, &transformed_ddO_channel); + } + if (dX) { + ResizeToChannelFirst(ctx, dX, &transformed_dX_channel); + ctx.template Alloc(&transformed_dX_channel); + } + + } else { + transformed_X_channel = *X; + transformed_dO_channel = *dO; + if (ddX) { + transformed_ddX_channel = *ddX; + } + if (ddO) { + transformed_ddO_channel.ShareDataWith(*ddO); + } + if (dX) { + transformed_dX_channel.ShareDataWith(*dX); + } + } + + auto in_dims = transformed_X_channel.dims(); + auto filter_dims = W->dims(); + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); + DenseTensor transformed_X(X->type()); + DenseTensor transformed_ddX(X->type()); + + DenseTensor transformed_dX(X->type()); + + std::vector padding_common(data_dim, 0); + std::vector input_pad(X->dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + new_input_shape_vec[0] = transformed_X_channel.dims()[0]; + new_input_shape_vec[1] = transformed_X_channel.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); + padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); + new_input_shape_vec[i + 2] = + transformed_X_channel.dims()[i + 2] + padding_diff[i]; + input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; + } + DDim new_input_shape(make_ddim(new_input_shape_vec)); + transformed_X.Resize(new_input_shape); + transformed_ddX.Resize(new_input_shape); + transformed_dX.Resize(new_input_shape); + + ctx.template Alloc(&transformed_X); + + if (ddX) { + ctx.template Alloc(&transformed_ddX); + } + if (dX) { + ctx.template Alloc(&transformed_dX); + } + + // pad for input + const int rank = X->dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction( + ctx, 
input_pad, transformed_X_channel, pad_value, &transformed_X); + if (ddX) { + funcs::PadFunction(ctx, + input_pad, + transformed_ddX_channel, + pad_value, + &transformed_ddX); + } + } break; + case 5: { + funcs::PadFunction( + ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + if (ddX) { + funcs::PadFunction(ctx, + input_pad, + transformed_ddX_channel, + pad_value, + &transformed_ddX); + } + } break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + + } else { + transformed_X.ShareDataWith(transformed_X_channel); + if (ddX) { + transformed_ddX.ShareDataWith(transformed_ddX_channel); + } + if (dX) { + transformed_dX.ShareDataWith(transformed_dX_channel); + } + + if (paddings.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[2 * i]; + } + } + } + + const T* x = transformed_X.data(); + + int iwo_group = groups; + int c_group = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_group = 1; + c_group = groups; + groups = 1; +#endif + auto dtype = paddle::platform::CudnnDataType::type; + + auto handle = ctx.cudnn_handle(); + auto layout = paddle::platform::GetCudnnTensorFormat( + paddle::platform::DataLayout::kNCHW); + + ConvArgs args1{handle, + &transformed_ddX, + W, + &transformed_ddO_channel, + strides, + padding_common, + dilations, + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; + ConvArgs args2{handle, + &transformed_X, + ddW, + &transformed_ddO_channel, + strides, + padding_common, + dilations, + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; + ConvArgs args3{handle, + &transformed_ddX, + dW, + &transformed_dO_channel, + strides, + padding_common, + dilations, + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; + ConvArgs args4{handle, + &transformed_dX, + ddW, + &transformed_dO_channel, + strides, + padding_common, + dilations, + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; + +#ifdef PADDLE_WITH_HIP + SearchResult fwd_result1; + SearchResult fwd_result2; + SearchResult data_result; + SearchResult filter_result; +#else + SearchResult fwd_result1; + SearchResult fwd_result2; + SearchResult data_result; + SearchResult filter_result; +#endif + + // ddo = conv(ddI, W) + conv(I, ddW) + size_t workspace_size = 0; + + T* transformed_ddy_channel = nullptr; + if (ddO) { + ddy = ddO->data(); + transformed_ddy_channel = transformed_ddO_channel.data(); + if (ddX) { + args1.idesc.set(transformed_ddX, iwo_group); + args1.wdesc.set(*W, layout, iwo_group); + args1.odesc.set(transformed_ddO_channel, iwo_group); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search1 = SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + fwd_result1.algo = search1::Find( + args1, exhaustive_search, false, workspace_size, ctx); +#else + using search1 = SearchAlgorithm; + fwd_result1 = search1::Find(ctx, args1, exhaustive_search, false); + workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); +#endif + } + + if (ddW) { + ddw = ddW->data(); + args2.idesc.set(transformed_X, iwo_group); + args2.wdesc.set(*ddW, layout, iwo_group); + args2.odesc.set(transformed_ddO_channel, iwo_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef 
PADDLE_WITH_HIP + using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + fwd_result2.algo = search2::Find( + args2, exhaustive_search, false, workspace_size, ctx); +#else + using search2 = SearchAlgorithm; + fwd_result2 = search2::Find(ctx, args2, exhaustive_search, false); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); +#endif + } + } + + if (dW && ddX) { + dw = dW->data(); + args3.idesc.set(transformed_ddX, iwo_group); + args3.wdesc.set(*dW, layout, iwo_group); + args3.odesc.set(transformed_dO_channel, iwo_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search3 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_result.algo = search3::Find( + args3, exhaustive_search, deterministic, workspace_size, ctx); +#else + using search3 = SearchAlgorithm; + filter_result = + search3::Find(ctx, args3, exhaustive_search, deterministic); + workspace_size = std::max( + workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); +#endif + } + + if (ddW && dX) { + transformed_dx = transformed_dX.data(); + + args4.idesc.set(transformed_dX, iwo_group); + args4.wdesc.set(*ddW, layout, iwo_group); + args4.odesc.set(transformed_dO_channel, iwo_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search4 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); + data_result.algo = search4::Find( + args4, exhaustive_search, deterministic, workspace_size, ctx); +#else + using search4 = SearchAlgorithm; + data_result = + search4::Find(ctx, args4, exhaustive_search, deterministic); + workspace_size = std::max( + workspace_size, search4::GetWorkspaceSize(args4, data_result.algo)); +#endif + } + + int i_n, i_c, i_d, i_h, i_w; + GetNCDHW( + transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); + + int o_n, o_c, o_d, o_h, o_w; + GetNCDHW(transformed_dO_channel.dims(), + DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = W->numel() / groups; + + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + + // NOTE(zhiqiu): inplace addto is not supported in double grad yet. + // ScalingParamType beta = ctx.Attr("use_addto") ?
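A note on the workspace bookkeeping in these search blocks: each query only ever raises workspace_size, so all four candidate algorithms share one scratch allocation sized for the largest of them. A reduced sketch of the pattern, with hypothetical byte counts standing in for the real GetWorkspaceSize results:

#include <algorithm>
#include <cstddef>
#include <vector>

int main() {
  // Stand-ins for the four SearchAlgorithm<...>::GetWorkspaceSize results.
  const std::size_t queried[] = {1u << 20, 4u << 20, 2u << 20, 8u << 20};
  std::size_t workspace_size = 0;
  for (std::size_t ws : queried) workspace_size = std::max(workspace_size, ws);
  // A single buffer of the running maximum backs every later dispatch,
  // mirroring workspace_handle.RunFunc(..., workspace_size) in the kernel.
  std::vector<char> workspace(workspace_size);
  return workspace.size() == (8u << 20) ? 0 : 1;
}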
1.0f : + // 0.0f; + // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); + auto workspace_handle = ctx.cudnn_workspace_handle(); + + if (ddO) { + if (ddX) { + ddx = transformed_ddX.data(); +#ifdef PADDLE_WITH_HIP + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args1.idesc.desc(), + ddx, + args1.wdesc.desc(), + w, + args1.cdesc.desc(), + fwd_result1.algo, + &beta, + args1.odesc.desc(), + transformed_ddy_channel, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionForward( + handle, + &alpha, + args1.idesc.desc(), + ddx + i * group_offset_in, + args1.wdesc.desc(), + w + i * group_offset_filter, + args1.cdesc.desc(), + fwd_result1.algo, + workspace_ptr, + workspace_size, + &beta, + args1.odesc.desc(), + transformed_ddy_channel + i * group_offset_out)); + }, + workspace_size); + } +#endif + } + if (ddW) { +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args2.idesc.desc(), + x, + args2.wdesc.desc(), + ddw, + args2.cdesc.desc(), + fwd_result2.algo, + &beta, + args2.odesc.desc(), + transformed_ddy_channel, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionForward( + handle, + &alpha, + args2.idesc.desc(), + x + i * group_offset_in, + args2.wdesc.desc(), + ddw + i * group_offset_filter, + args2.cdesc.desc(), + fwd_result2.algo, + workspace_ptr, + workspace_size, + &alpha, + args2.odesc.desc(), + transformed_ddy_channel + i * group_offset_out)); + }, + workspace_size); + } +#endif + } + if (channel_last) { + TransToChannelLast(ctx, &transformed_ddO_channel, ddO); + } + } + T* transformed_dy_channel = transformed_dO_channel.data(); + if (dW && ddX) { + ddx = transformed_ddX.data(); +#ifdef PADDLE_WITH_HIP + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args3.odesc.desc(), + transformed_dy_channel, + args3.idesc.desc(), + ddx, + args3.cdesc.desc(), + filter_result.algo, + &beta, + args3.wdesc.desc(), + dw, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardFilter( + handle, + &alpha, + args3.idesc.desc(), + ddx + i * group_offset_in, + args3.odesc.desc(), + transformed_dy_channel + i * group_offset_out, + args3.cdesc.desc(), + filter_result.algo, + workspace_ptr, + workspace_size, + &beta, + args3.wdesc.desc(), + dw + i * group_offset_filter)); + }, + workspace_size); + } +#endif + } + + if (dX && ddW) { + ddw = ddW->data(); +#ifdef PADDLE_WITH_HIP + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args4.odesc.desc(), + transformed_dy_channel, + args4.wdesc.desc(), + ddw, + args4.cdesc.desc(), + 
data_result.algo, + &beta, + args4.idesc.desc(), + transformed_dx, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardData( + handle, + &alpha, + args4.wdesc.desc(), + ddw + i * group_offset_filter, + args4.odesc.desc(), + transformed_dy_channel + i * group_offset_out, + args4.cdesc.desc(), + data_result.algo, + workspace_ptr, + workspace_size, + &beta, + args4.idesc.desc(), + transformed_dx + i * group_offset_in)); + }, + workspace_size); + } +#endif + + if (!is_sys_pad) { + // reverse padded input + std::vector starts(X->dims().size(), 0); + std::vector axes(X->dims().size(), 0); + + for (size_t i = 0; i < X->dims().size(); ++i) { + starts[i] = input_pad[2 * i]; + axes[i] = i; + } + if (X->dims().size() == 4) { + RemovePaddingSlice( + ctx, &transformed_dX, &transformed_dX_channel, starts, axes); + } else { + RemovePaddingSlice( + ctx, &transformed_dX, &transformed_dX_channel, starts, axes); + } + } + if (channel_last) { + TransToChannelLast(ctx, &transformed_dX_channel, dX); + } + } +} + +template +void DepthwiseConvDoubleGradGPUDNNKernel( + const Context& ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + bool fuse_relu, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { + ConvCudnnGradGradKernel(ctx, + input, + filter, + out_grad, + input_grad_grad, + filter_grad_grad, + strides, + paddings_t, + padding_algorithm, + dilations_t, + groups, + data_format, + input_grad, + filter_grad, + out_grad_grad); +} + +template +void Conv3DCudnnGradGradKernel( + const Context& ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { + ConvCudnnGradGradKernel(ctx, + input, + filter, + out_grad, + input_grad_grad, + filter_grad_grad, + strides, + paddings_t, + padding_algorithm, + dilations_t, + groups, + data_format, + input_grad, + filter_grad, + out_grad_grad); +} + } // namespace phi #ifdef PADDLE_WITH_HIP @@ -671,6 +1379,26 @@ PD_REGISTER_KERNEL(depthwise_conv2d_grad, phi::DepthwiseConvCudnnGradKernel, float, phi::dtype::float16) {} +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvDoubleGradGPUDNNKernel, + float, + phi::dtype::float16) {} #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(conv2d_grad, @@ -690,6 +1418,32 @@ 
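The grouped dispatch loops above address one group per cudnn/miopen call through fixed element offsets into the packed NCHW buffers. A small self-check of that pointer arithmetic, with assumed example sizes:

#include <cassert>

int main() {
  // Assumed example: input [N, C, H, W] = [2, 8, 4, 4], filter
  // [Cout, Cin/groups, kh, kw] = [8, 4, 3, 3], groups = 2, i_d = 1 in 2-D.
  const int i_c = 8, i_d = 1, i_h = 4, i_w = 4, groups = 2;
  const int filter_numel = 8 * 4 * 3 * 3;
  const int group_offset_in = i_c / groups * i_h * i_w * i_d;
  const int group_offset_filter = filter_numel / groups;
  assert(group_offset_in == 64);       // elements of one input group
  assert(group_offset_filter == 144);  // elements of one filter group
  // The loops `for (int i = 0; i < groups; i++)` then issue independent
  // calls at base + i * offset for each group.
  return 0;
}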
PD_REGISTER_KERNEL(conv3d_grad, double, phi::dtype::float16, phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvDoubleGradGPUDNNKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} #else PD_REGISTER_KERNEL(conv2d_grad, GPUDNN, @@ -707,6 +1461,29 @@ PD_REGISTER_KERNEL(conv3d_grad, double, phi::dtype::float16) {} +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvDoubleGradGPUDNNKernel, + float, + double, + phi::dtype::float16) {} #endif #endif diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index bbac83475518e..3d1e7776ba394 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -42,18 +42,23 @@ void ConvCudnnKernel(const Context& ctx, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, DenseTensor* output) { ctx.template Alloc(output); std::vector paddings = paddings_t; std::vector dilations = dilations_t; - bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool has_exhaustive_search = ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, ctx.GetDnnAttr("exhaustive_search")) + : false; + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, false, @@ -402,12 +407,9 @@ void Conv3DCudnnKernel(const Context& dev_ctx, strides, paddings, padding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, out); } @@ -432,12 +434,9 @@ void DepthwiseConvCudnnKernel(const Context& dev_ctx, strides, paddings, padding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, out); } diff --git a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h deleted file mode 100644 index 512b1529f9191..0000000000000 --- a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h +++ /dev/null @@ -1,330 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
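For reference, the double-grad registrations added across these hunks cover three build configurations; the dtype lists differ because bfloat16 convolutions require cuDNN 8.1 or newer:

// Dtypes registered for conv2d_grad_grad / conv3d_grad_grad /
// depthwise_conv2d_grad_grad in this patch:
//   PADDLE_WITH_HIP (ROCm)  : float, float16
//   CUDA, cuDNN >= 8.1.0    : float, double, float16, bfloat16
//   CUDA, cuDNN <  8.1.0    : float, double, float16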
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/phi/kernels/conv_kernel.h" -#include "paddle/phi/kernels/cpu/conv_util.h" -#include "paddle/phi/kernels/funcs/batch_norm_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -template -void ConvGradGradKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides_t, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - const DenseTensor* X = &input; - const DenseTensor* dY = &out_grad; - const DenseTensor* ddX = input_grad_grad.get_ptr(); - const DenseTensor* ddW_in = filter_grad_grad.get_ptr(); - - DenseTensor* ddY = out_grad_grad; - DenseTensor* dW = filter_grad; - DenseTensor* dX = input_grad; - DenseTensor W = filter; - - if (!ddY && !dW && !dX) return; - - const std::vector strides = strides_t; - std::vector paddings = paddings_t; - std::vector dilations = dilations_t; - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - DenseTensor transformed_X(X->type()); - DenseTensor transformed_dY(dY->type()); - DenseTensor transformed_ddX(X->type()); - - if (channel_last) { - ResizeToChannelFirst(dev_ctx, X, &transformed_X); - TransToChannelFirst(dev_ctx, X, &transformed_X); - - ResizeToChannelFirst(dev_ctx, dY, &transformed_dY); - TransToChannelFirst(dev_ctx, dY, &transformed_dY); - - if (ddX) { - ResizeToChannelFirst(dev_ctx, ddX, &transformed_ddX); - TransToChannelFirst(dev_ctx, ddX, &transformed_ddX); - } - } else { - transformed_X = *X; - transformed_dY = *dY; - if (ddX) { - transformed_ddX = *ddX; - } - } - - // update padding and dilation - auto in_dims = transformed_X.dims(); - auto filter_dims = W.dims(); - - DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); - DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_X.dims()[0]); - std::vector filter_shape_vec(vectorize(W.dims())); - std::vector output_shape_vec(vectorize(transformed_dY.dims())); - - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - // col_shape [in_channel/group, kh, kw, oh, ow] - col_shape_vec[0] = transformed_X.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; - } - DDim col_shape(make_ddim(col_shape_vec)); - // 
col_matrix_shape [in_channel/group * kh * kw, oh * ow] - DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); - // input_shape [Cin, H, W] - DDim input_shape = - slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); - // filter_matrix_shape [Cout, Cin * kh * kw] - DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]}; - - W.Resize(filter_matrix_shape); - DDim output_matrix_shape = { - transformed_dY.dims()[1], - transformed_dY.numel() / - (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; - int in_step = static_cast(transformed_X.dims()[1]) / groups; - int out_step = static_cast(transformed_dY.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - DenseTensor col; - DenseTensor col_matrix; - if (is_expand) { - col.Resize(col_shape); - dev_ctx.template Alloc(&col); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - // dx convolution double grad: gemm + col2im(col2vol) - // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, - // oH, oW) - if (dX && ddW_in) { - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - dev_ctx.template Alloc(dX); - - DenseTensor transformed_dX(dX->type()); - - if (channel_last) { - ResizeToChannelFirst(dev_ctx, dX, &transformed_dX); - - } else { - transformed_dX = *dX; - } - // if is_expand is false, the operation of set_zero is unnecessary - // because math::matmul will reset dx - if (is_expand) { - set_zero(dev_ctx, &transformed_dX, static_cast(0)); - } - paddle::operators::math::Col2VolFunctor col2vol; - paddle::operators::math:: - Col2ImFunctor - col2im; - - for (int i = 0; i < batch_size; i++) { - DenseTensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col_matrix.ShareDataWith(dx_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul( - ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, - col, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &dx_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); - } - } - } - if (channel_last) { - TransToChannelLast(dev_ctx, &transformed_dX, dX); - } - } - - // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, - // oH, oW) - // dw convolution double grad: im2col(vol2col) + gemm - if (dW && ddX) { - dev_ctx.template Alloc(dW); - set_zero(dev_ctx, dW, static_cast(0)); - DenseTensor dW_arr = *dW; - dW_arr.Resize(filter_matrix_shape); - paddle::operators::math:: - Im2ColFunctor - im2col; - paddle::operators::math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - DenseTensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; ++g) { - // im2col - DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - DenseTensor ddx_slice = 
ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, - ddx_slice, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - - DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul( - dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0)); - } - } - } - - // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), - // w/ddw(Cout, Cin, kh, kw) - // ddy convolution double grad: im2col(vol2col) + gemm - if (ddY) { - dev_ctx.template Alloc(ddY); - - DenseTensor transformed_ddY(ddY->type()); - if (channel_last) { - ResizeToChannelFirst(dev_ctx, ddY, &transformed_ddY); - } else { - transformed_ddY = *ddY; - } - - set_zero(dev_ctx, &transformed_ddY, static_cast(0)); - paddle::operators::math:: - Im2ColFunctor - im2col; - paddle::operators::math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - DenseTensor ddy_batch = - transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape); - for (int g = 0; g < groups; ++g) { - // gemm - DenseTensor ddy_slice = - ddy_batch.Slice(g * out_step, (g + 1) * out_step); - - if (ddX) { - DenseTensor ddx_batch = - transformed_ddX.Slice(i, i + 1).Resize(input_shape); - DenseTensor ddx_slice = - ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, - ddx_slice, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul( - w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0)); - } - - if (ddW_in) { - DenseTensor x_batch = - transformed_X.Slice(i, i + 1).Resize(input_shape); - DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); - - DenseTensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - if (!is_expand) { - col.ShareDataWith(x_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, - x_slice, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); - } - - // gemm - DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul( - ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0)); - } - } - } - if (channel_last) { - TransToChannelLast(dev_ctx, &transformed_ddY, ddY); - } - } -} - -} // namespace phi diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h index 6674500c3c2e5..0d4cdddf6b520 100644 --- a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h @@ -16,7 +16,6 @@ #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/phi/kernels/conv_grad_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include 
"paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -32,12 +31,9 @@ void ConvGradKernel(const Context& dev_ctx, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* input_grad, DenseTensor* filter_grad) { // The filter and filter_grad will be reshaped in the calculations, @@ -254,4 +250,304 @@ void ConvGradKernel(const Context& dev_ctx, } } +template +void ConvGradGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + const std::vector& dilations_t, + int groups, + const std::string& data_format, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { + const DenseTensor* X = &input; + const DenseTensor* dY = &out_grad; + const DenseTensor* ddX = input_grad_grad.get_ptr(); + const DenseTensor* ddW_in = filter_grad_grad.get_ptr(); + + DenseTensor* ddY = out_grad_grad; + DenseTensor* dW = filter_grad; + DenseTensor* dX = input_grad; + DenseTensor W = filter; + + if (!ddY && !dW && !dX) return; + + const std::vector strides = strides_t; + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform Tensor + DenseTensor transformed_X(X->type()); + DenseTensor transformed_dY(dY->type()); + DenseTensor transformed_ddX(X->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, X, &transformed_X); + TransToChannelFirst(dev_ctx, X, &transformed_X); + + ResizeToChannelFirst(dev_ctx, dY, &transformed_dY); + TransToChannelFirst(dev_ctx, dY, &transformed_dY); + + if (ddX) { + ResizeToChannelFirst(dev_ctx, ddX, &transformed_ddX); + TransToChannelFirst(dev_ctx, ddX, &transformed_ddX); + } + } else { + transformed_X = *X; + transformed_dY = *dY; + if (ddX) { + transformed_ddX = *ddX; + } + } + + // update padding and dilation + auto in_dims = transformed_X.dims(); + auto filter_dims = W.dims(); + + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + const int batch_size = static_cast(transformed_X.dims()[0]); + std::vector filter_shape_vec(vectorize(W.dims())); + std::vector output_shape_vec(vectorize(transformed_dY.dims())); + + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + // col_shape [in_channel/group, kh, kw, oh, ow] + col_shape_vec[0] = transformed_X.dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; + } + DDim col_shape(make_ddim(col_shape_vec)); + // col_matrix_shape [in_channel/group * kh * kw, oh * ow] + DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); + // input_shape [Cin, H, W] + DDim input_shape = + slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); + // filter_matrix_shape [Cout, Cin * kh * kw] 
+ DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]}; + + W.Resize(filter_matrix_shape); + DDim output_matrix_shape = { + transformed_dY.dims()[1], + transformed_dY.numel() / + (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; + int in_step = static_cast(transformed_X.dims()[1]) / groups; + int out_step = static_cast(transformed_dY.dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + DenseTensor col; + DenseTensor col_matrix; + if (is_expand) { + col.Resize(col_shape); + dev_ctx.template Alloc(&col); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + phi::funcs::SetConstant set_zero; + auto blas = phi::funcs::GetBlas(dev_ctx); + + // dx convolution double grad: gemm + col2im(col2vol) + // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, + // oH, oW) + if (dX && ddW_in) { + Tensor ddW; + ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); + dev_ctx.template Alloc(dX); + + DenseTensor transformed_dX(dX->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, dX, &transformed_dX); + + } else { + transformed_dX = *dX; + } + // if is_expand is false, the operation of set_zero is unnecessary + // because math::matmul will reset dx + if (is_expand) { + set_zero(dev_ctx, &transformed_dX, static_cast(0)); + } + paddle::operators::math::Col2VolFunctor col2vol; + paddle::operators::math:: + Col2ImFunctor + col2im; + + for (int i = 0; i < batch_size; i++) { + DenseTensor dy_batch = + transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); + DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); + DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col_matrix.ShareDataWith(dx_slice); + col_matrix.Resize(col_matrix_shape); + } + blas.MatMul( + ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, + col, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &dx_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); + } + } + } + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_dX, dX); + } + } + + // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, + // oH, oW) + // dw convolution double grad: im2col(vol2col) + gemm + if (dW && ddX) { + dev_ctx.template Alloc(dW); + set_zero(dev_ctx, dW, static_cast(0)); + DenseTensor dW_arr = *dW; + dW_arr.Resize(filter_matrix_shape); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; ++i) { + DenseTensor dy_batch = + transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); + Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; ++g) { + // im2col + DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col.ShareDataWith(ddx_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + ddx_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], 
paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); + } + + DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0)); + } + } + } + + // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), + // w/ddw(Cout, Cin, kh, kw) + // ddy convolution double grad: im2col(vol2col) + gemm + if (ddY) { + dev_ctx.template Alloc(ddY); + + DenseTensor transformed_ddY(ddY->type()); + if (channel_last) { + ResizeToChannelFirst(dev_ctx, ddY, &transformed_ddY); + } else { + transformed_ddY = *ddY; + } + + set_zero(dev_ctx, &transformed_ddY, static_cast(0)); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; ++i) { + DenseTensor ddy_batch = + transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape); + for (int g = 0; g < groups; ++g) { + // gemm + DenseTensor ddy_slice = + ddy_batch.Slice(g * out_step, (g + 1) * out_step); + + if (ddX) { + DenseTensor ddx_batch = + transformed_ddX.Slice(i, i + 1).Resize(input_shape); + DenseTensor ddx_slice = + ddx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col.ShareDataWith(ddx_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + ddx_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); + } + DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0)); + } + + if (ddW_in) { + DenseTensor x_batch = + transformed_X.Slice(i, i + 1).Resize(input_shape); + DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); + + DenseTensor ddW; + ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); + if (!is_expand) { + col.ShareDataWith(x_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + x_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); + } + + // gemm + DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0)); + } + } + } + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_ddY, ddY); + } + } +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/conv_kernel_impl.h b/paddle/phi/kernels/impl/conv_kernel_impl.h index 2ef2ed8af2809..eb2d183981213 100644 --- a/paddle/phi/kernels/impl/conv_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_kernel_impl.h @@ -25,19 +25,16 @@ namespace phi { template -void ConvKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter_t, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - DenseTensor* output) { +void ConvKernelImpl(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter_t, + const std::vector& strides, + const std::vector& 
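Worth spelling out from the ddY branch above: the pair of MatMul calls realizes ddY = W * ddX + X * ddW purely through the GEMM beta argument, with no intermediate buffer. In scalar form:

//   first call : ddy = 1.0 * (w_slice   x col_matrix) + 0.0 * ddy  // overwrite
//   second call: ddy = 1.0 * (ddw_slice x col_matrix) + 1.0 * ddy  // accumulate
// beta = T(0.0) then T(1.0) replaces an explicit temporary plus add.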
paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + DenseTensor* output) { std::vector paddings = paddings_t; std::vector dilations = dilations_t; DenseTensor filter = filter_t; diff --git a/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h b/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h index ae76574e04e71..2d41d04f43c49 100644 --- a/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h @@ -13,9 +13,6 @@ // limitations under the License. #pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows -#endif #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/impl/erfinv_kernel_impl.h b/paddle/phi/kernels/impl/erfinv_kernel_impl.h deleted file mode 100644 index c0fb8a01b9971..0000000000000 --- a/paddle/phi/kernels/impl/erfinv_kernel_impl.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows -#endif -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" - -namespace phi { - -template -void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { - ctx.template Alloc(out); - auto eigen_in = EigenVector::Flatten(x); - auto eigen_out = EigenVector::Flatten(*out); - auto& place = *ctx.eigen_device(); - constexpr T half = static_cast(0.5); - constexpr T half_sqrt = static_cast(M_SQRT1_2); - eigen_out.device(place) = (eigen_in * half + half).ndtri() * half_sqrt; -} - -} // namespace phi diff --git a/paddle/phi/kernels/onednn/conv_grad_kernel.cc b/paddle/phi/kernels/onednn/conv_grad_kernel.cc new file mode 100644 index 0000000000000..69c8122966308 --- /dev/null +++ b/paddle/phi/kernels/onednn/conv_grad_kernel.cc @@ -0,0 +1,192 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/conv_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/data_layout_transform.h" +#include "paddle/phi/kernels/onednn/conv_handler.h" + +namespace phi { + +#define PD_VISIT_FLOAT_AND_BF16_TYPES(TYPE, NAME, ...) 
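One piece of math worth recording next to the erfinv code removed in this hunk: the Eigen expression follows from a standard identity relating erf to the standard normal CDF (Phi) and its inverse (ndtri).

// With Phi the standard normal CDF: erf(y) = 2 * Phi(y * sqrt(2)) - 1.
// Solving erf(y) = x therefore gives
//   y = ndtri((x + 1) / 2) / sqrt(2) = ndtri(x * 0.5 + 0.5) * M_SQRT1_2,
// which is exactly the deleted Eigen expression.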
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::BFLOAT16, \ + ::phi::dtype::bfloat16, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +template +void ConvGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType(), + AllocationType::CPU, + phi::errors::PreconditionNotMet( + "Operator DNNL ConvGrad must use CPUPlace")); + const auto& onednn_engine = dev_ctx.GetEngine(); + + const auto* bias = + dev_ctx.HasDnnInput("Bias") ? dev_ctx.GetDnnInput("Bias") : nullptr; + bool is_test = dev_ctx.HasDnnAttr("is_test") + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test")) + : false; + + if (!input_grad && !filter_grad) return; + + const std::string& unique_name = + dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0]; + + PD_VISIT_FLOAT_AND_BF16_TYPES( + filter.dtype(), "ConvOneDNNHandlerT", ([&] { + // TODO(jczaja): Are all tensors really needed? + onednn::ConvOneDNNHandlerT handler(dev_ctx, + dev_ctx.GetPlace(), + &input, + &filter, + bias, + &out_grad, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + filter_grad, + input_grad, + unique_name); + + // create mkldnn memory from input tensors (data/weights) + auto& astream = OneDNNContext::tls().get_stream(); + + if (filter_grad) { + auto src_memory_p = + handler.AcquireSrcMemoryWithReorderFromWeightsPrimitive(&input); + auto diff_dst_memory_p = + handler.AcquireDiffDstMemoryWithReorderFromWeightsPrimitive( + &out_grad); + + // For convolution with groups write filter grad into + // oneDNN buffer and then we reorder it into filter_grad tensor + int g = std::max(groups, 1); + auto diff_weights_memory_p = + g > 1 ? handler.AcquireDiffWeightsMemory() + : handler.AcquireDiffWeightsMemory(filter_grad); + + auto conv_bwd_weights_p = handler.AcquireBackwardWeightsPrimitive(); + + conv_bwd_weights_p->execute( + astream, + {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + {DNNL_ARG_DIFF_WEIGHTS, *diff_weights_memory_p}}); + astream.wait(); + + // For convolution with groups convert from blocked to NCHW + // otherwise there will be problems in the next operators working on + // this data + if (g > 1) { + // in OneDNN groups in convolution are treated as a separate + // dimension, which is not the case in PaddlePaddle + + dnnl::memory::data_type in_type = + funcs::ToOneDNNDataType(filter.dtype()); + // for 3d conv with groups (six dimensional data reorder to + // goidhw) for 2d conv with groups (five dimensional data reorder + // to goihw) + + auto weights_tz = diff_weights_memory_p->get_desc().dims(); + dnnl::memory::format_tag out_format = + weights_tz.size() == 6 ?
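To see why the goihw buffer produced by this reorder can later be relabeled without moving data, compare the descriptor dims; the concrete sizes below are assumed for illustration:

// With groups g the reordered buffer is goihw: [g, O/g, Cin/g, kh, kw];
// collapsing the leading two axes yields the Paddle filter_grad shape
// oihw: [O, Cin/g, kh, kw] over the same contiguous elements.
// Assumed example, g = 2, O = 6, Cin = 4, k = 3:
//   goihw dims [2, 3, 2, 3, 3] and oihw dims [6, 2, 3, 3] both describe
//   the same 108 elements in the same order; only the descriptor changes.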
dnnl::memory::format_tag::goidhw + : dnnl::memory::format_tag::goihw; + funcs::ReorderOneDNNHandler handler( + weights_tz, filter.dtype(), in_type, onednn_engine); + auto reorder_dst_memory_p = handler.AcquireDstMemory( + filter_grad, out_format, dev_ctx.GetPlace()); + + auto reorder_p = handler.AcquireReorder(reorder_dst_memory_p, + diff_weights_memory_p); + + { + reorder_p->execute( + astream, *diff_weights_memory_p, *reorder_dst_memory_p); + astream.wait(); + } + + // So here we have data in goihw, which can be interpreted as + // OIHW (OIDHW for conv3d) because filter_grad shape is set for + // OIHW (OIDHW for conv3d) + dnnl::memory::format_tag target_format = + weights_tz.size() == 6 ? dnnl::memory::format_tag::oidhw + : dnnl::memory::format_tag::oihw; + filter_grad->set_mem_desc( + dnnl::memory::desc(phi::vectorize(filter_grad->dims()), + in_type, + target_format)); + } else { + filter_grad->set_mem_desc(diff_weights_memory_p->get_desc()); + } + } + if (input_grad) { + auto weights_memory_p = + handler.AcquireWeightsMemoryWithReorderFromDataPrimitive( + &filter, groups, strides.size() == 3U); + + auto diff_dst_memory_p = + handler.AcquireDiffDstMemoryWithReorderMemoryFromDataPrimitive( + &out_grad); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(input_grad); + + auto conv_bwd_data_p = handler.AcquireBackwardPrimitive(); + + conv_bwd_data_p->execute(astream, + {{DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); + + input_grad->set_mem_desc(diff_src_memory_p->get_desc()); + } + })); +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d_grad, + OneDNN, + ONEDNN, + phi::ConvGradKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/conv_handler.h b/paddle/phi/kernels/onednn/conv_handler.h new file mode 100644 index 0000000000000..723784a845c2d --- /dev/null +++ b/paddle/phi/kernels/onednn/conv_handler.h @@ -0,0 +1,763 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/onednn/onednn_helper.h" +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/expect.h" +#include "paddle/phi/kernels/cpu/conv_util.h" + +namespace phi { +namespace onednn { + +inline funcs::OneDNNMemoryFormat GetWeightsFormat(int groups, bool is_conv3d) { + if (is_conv3d) { + return (groups == 1) ? funcs::OneDNNMemoryFormat::oidhw + : funcs::OneDNNMemoryFormat::goidhw; + } else { + return (groups == 1) ?
funcs::OneDNNMemoryFormat::oihw + : funcs::OneDNNMemoryFormat::goihw; + } +} + +template +class ConvOneDNNHandlerT + : public funcs::OneDNNHandlerT { + public: + ConvOneDNNHandlerT(const OneDNNContext& dev_ctx, + const dnnl::engine mkldnn_engine, + Place cpu_place, + const phi::DenseTensor* input, + const phi::DenseTensor* filter, + const phi::DenseTensor* bias, + const std::vector& strides_in, + const std::vector& paddings_in, + const std::string& padding_algorithm, + const std::vector& dilations_in, + int groups, + const std::string& data_format, + bool is_test, + bool is_BFLOAT16, + const std::string& fuse_activation, + bool fuse_residual_conn, + bool force_fp32_output, + phi::DenseTensor* output, + const std::string& unique_name) + : funcs::OneDNNHandlerT( + dev_ctx, + mkldnn_engine, + cpu_place, + funcs::CreateKey( + dev_ctx, phi::vectorize(input->dims()), unique_name)) { + if (unlikely(!this->isCached())) { + PADDLE_ENFORCE_EQ( + input->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The input tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + input->layout())); + + PADDLE_ENFORCE_EQ( + filter->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The Filter tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + filter->layout())); + + PADDLE_ENFORCE_GE( + input->dims().size(), + 4, + phi::errors::InvalidArgument( + "Input must be with 4 or 5 dimensions, i.e. NCHW or " + "NCDHW, but got dimension = %d .", + input->dims().size())); + PADDLE_ENFORCE_LE( + input->dims().size(), + 5, + phi::errors::InvalidArgument( + "Input must be with 4 or 5 dimensions, i.e. NCHW or " + "NCDHW, but got dimension = %d .", + input->dims().size())); + + PADDLE_ENFORCE_GE( + filter->dims().size(), + 4, + phi::errors::InvalidArgument( + "Filter must be with 4 or 5 dimensions, i.e. OIHW or " + "OIDHW, but got dimension = %d .", + filter->dims().size())); + PADDLE_ENFORCE_LE( + filter->dims().size(), + 5, + phi::errors::InvalidArgument( + "Filter must be with 4 or 5 dimensions, i.e. OIHW or " + "OIDHW, but got dimension = %d .", + filter->dims().size())); + + if (bias) { + PADDLE_ENFORCE_EQ( + bias->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The Bias tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + bias->layout())); + + PADDLE_ENFORCE_EQ( + bias->dims().size(), + 1, + phi::errors::InvalidArgument("Bias must only have 1 dimension, " + "i.e. 
X, but got dimension = %d .", + bias->dims().size())); + } + const auto input_dims = input->dims(); + const auto data_dims = phi::slice_ddim(input_dims, 2, input_dims.size()); + const auto filter_dims = filter->dims(); + const auto filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + const auto ksize = phi::vectorize(filter_data_dims); + std::vector strides(begin(strides_in), end(strides_in)); + std::vector paddings(begin(paddings_in), end(paddings_in)); + std::vector dilations(begin(dilations_in), end(dilations_in)); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, data_dims, strides, ksize); + std::transform( + dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { + return i - 1; + }); + + const auto src_tz = phi::vectorize(input->dims()); + + auto weights_tz = phi::vectorize(filter->dims()); + funcs::GetGroupConvWeightsTz(weights_tz, groups); + + const auto dst_tz = phi::vectorize(output->dims()); + + const dnnl::memory::dims stride_dims = strides; + const auto onednn_paddings = funcs::ToOnednnPadding(paddings); + const dnnl::memory::dims dilations_dims = dilations; + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + auto chosen_memory_format = funcs::OneDNNMemoryFormat::any; + auto data_type = dnnl::memory::data_type::f32; + if (is_BFLOAT16 || std::is_same::value) { + data_type = dnnl::memory::data_type::bf16; + } + + dnnl::memory::desc src_md, weights_md; + if (funcs::is_int8()) { + src_md = funcs::OneDNNMemDesc(src_tz, + funcs::ToOneDNNDataType(input->dtype()), + chosen_memory_format); + weights_md = funcs::OneDNNMemDesc( + weights_tz, dnnl::memory::data_type::s8, chosen_memory_format); + } else { + src_md = funcs::OneDNNMemDesc(src_tz, data_type, chosen_memory_format); + weights_md = funcs::OneDNNMemDesc( + weights_tz, data_type, funcs::OneDNNMemoryFormat::any); + } + + const auto dst_md = funcs::OneDNNMemDesc( + dst_tz, funcs::OneDNNGetDataType(), chosen_memory_format); + const auto fwd_prop_kind = is_test ? 
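The format_tag::any idiom used for src_md and weights_md above merits a standalone illustration. This sketch uses the raw dnnl:: API directly, assuming a working oneDNN build rather than the funcs:: wrappers in this file:

#include "dnnl.hpp"

// Build a memory descriptor whose physical layout is left to oneDNN:
// a primitive descriptor created from it may pick a blocked layout
// (e.g. nChw16c on AVX-512), and the caller then reorders into it.
inline dnnl::memory::desc AnyF32Desc(const dnnl::memory::dims& tz) {
  return dnnl::memory::desc(
      tz, dnnl::memory::data_type::f32, dnnl::memory::format_tag::any);
}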
dnnl::prop_kind::forward_inference + : dnnl::prop_kind::forward_training; + const dnnl::primitive_attr conv_attr = CreateConvAttrs(filter, + groups, + force_fp32_output, + fuse_residual_conn, + fuse_activation); + + if (bias) { + auto bias_tz = phi::vectorize(bias->dims()); + dnnl::memory::desc bias_md; + if (funcs::is_int8()) { + bias_md = funcs::OneDNNMemDesc(bias_tz, + dnnl::memory::data_type::s32, + funcs::OneDNNMemoryFormat::x); + } else { + bias_md = funcs::OneDNNMemDesc( + bias_tz, data_type, funcs::OneDNNMemoryFormat::x); + } + + this->AcquireForwardPrimitiveDescriptor( + conv_attr, + fwd_prop_kind, + dnnl::algorithm::convolution_direct, + src_md, + weights_md, + bias_md, + dst_md, + stride_dims, + dilations_dims, + onednn_paddings[0], + onednn_paddings[1]); + } else { + this->AcquireForwardPrimitiveDescriptor( + conv_attr, + fwd_prop_kind, + dnnl::algorithm::convolution_direct, + src_md, + weights_md, + dst_md, + stride_dims, + dilations_dims, + onednn_paddings[0], + onednn_paddings[1]); + } + } + } + + ConvOneDNNHandlerT(const OneDNNContext& dev_ctx, + Place cpu_place, + const phi::DenseTensor* in, + const phi::DenseTensor* filter, + const phi::DenseTensor* bias, + const phi::DenseTensor* out_grad, + const std::vector& strides_in, + const std::vector& paddings_in, + const std::string& padding_algorithm, + const std::vector& dilations_in, + int groups, + const std::string& data_format, + bool is_test, + phi::DenseTensor* filter_grad, + phi::DenseTensor* in_x_grad, + const std::string& unique_name) + : funcs::OneDNNHandlerT( + dev_ctx, + dev_ctx.GetEngine(), + cpu_place, + funcs::CreateKey( + dev_ctx, phi::vectorize(in->dims()), unique_name)) { + if (unlikely(!this->isBwdCached())) { + PADDLE_ENFORCE_EQ( + in->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The input tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + in->layout())); + + PADDLE_ENFORCE_EQ( + filter->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The filter tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + filter->layout())); + + PADDLE_ENFORCE_EQ( + out_grad->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The output_grad tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + out_grad->layout())); + + PADDLE_ENFORCE_EQ( + is_test, + false, + phi::errors::InvalidArgument( + "is_test attribute should be set to False in training phase.")); + + std::vector strides(begin(strides_in), end(strides_in)); + std::vector paddings(begin(paddings_in), end(paddings_in)); + std::vector dilations(begin(dilations_in), end(dilations_in)); + + auto input_dims = in->dims(); + auto data_dims = phi::slice_ddim(input_dims, 2, input_dims.size()); + auto filter_dims = filter->dims(); + auto filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + auto ksize = phi::vectorize(filter_data_dims); + + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, data_dims, strides, ksize); + + auto src_tz = phi::vectorize(in->dims()); + auto weights_tz = phi::vectorize(filter->dims()); + + int g = std::max(groups, 1); + funcs::GetGroupConvWeightsTz(weights_tz, g); + auto dst_tz = phi::vectorize(out_grad->dims()); + + /* create memory descriptor for conv backward without specified format + * ('any') which lets a primitive (conv backward in this case) choose + * the memory format preferred for best performance + */ + const auto chosen_memory_format = funcs::OneDNNMemoryFormat::any; + const auto weights_format = 
funcs::OneDNNMemoryFormat::any; + + auto src_md = funcs::OneDNNMemDesc( + src_tz, funcs::OneDNNGetDataType(), chosen_memory_format); + const auto dst_md = funcs::OneDNNMemDesc( + dst_tz, funcs::OneDNNGetDataType(), chosen_memory_format); + auto diff_src_md = funcs::OneDNNMemDesc( + src_tz, funcs::OneDNNGetDataType(), chosen_memory_format); + auto weights_md = funcs::OneDNNMemDesc( + weights_tz, funcs::OneDNNGetDataType(), weights_format); + auto diff_weights_md = funcs::OneDNNMemDesc( + weights_tz, funcs::OneDNNGetDataType(), weights_format); + auto diff_dst_md = funcs::OneDNNMemDesc( + dst_tz, funcs::OneDNNGetDataType(), chosen_memory_format); + + auto onednn_paddings = funcs::ToOnednnPadding(paddings); + std::transform( + dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { + return i - 1; + }); + const dnnl::memory::dims dilations_dims = dilations; + + const dnnl::memory::dims stride_dims = strides; + // Recreating FWD PD. For training there are no post ops in convolution + dnnl::primitive_attr conv_attr; + if (bias) { + auto bias_tz = phi::vectorize(bias->dims()); + dnnl::memory::desc bias_md; + if (funcs::is_int8()) { + bias_md = funcs::OneDNNMemDesc(bias_tz, + dnnl::memory::data_type::s32, + funcs::OneDNNMemoryFormat::x); + } else { + bias_md = funcs::OneDNNMemDesc(bias_tz, + dnnl::memory::data_type::f32, + funcs::OneDNNMemoryFormat::x); + } + + this->AcquireForwardPrimitiveDescriptor( + conv_attr, + dnnl::prop_kind::forward_training, + dnnl::algorithm::convolution_direct, + src_md, + weights_md, + bias_md, + dst_md, + stride_dims, + dilations_dims, + onednn_paddings[0], + onednn_paddings[1]); + } else { + this->AcquireForwardPrimitiveDescriptor( + conv_attr, + dnnl::prop_kind::forward_training, + dnnl::algorithm::convolution_direct, + src_md, + weights_md, + dst_md, + stride_dims, + dilations_dims, + onednn_paddings[0], + onednn_paddings[1]); + } + + this->AcquireBackwardPrimitiveDescriptor( + dnnl::algorithm::convolution_direct, + diff_src_md, + weights_md, + diff_dst_md, + strides, + dilations_dims, + onednn_paddings[0], + onednn_paddings[1]); + + this->AcquireBackwardWeightsPrimitiveDescriptor( + dnnl::algorithm::convolution_direct, + src_md, + diff_weights_md, + diff_dst_md, + strides, + dilations_dims, + onednn_paddings[0], + onednn_paddings[1]); + } + } + + std::shared_ptr>> get_int8_bias_scales( + const DenseTensor* filter, + int groups, + const std::vector& scale_weights_data) { + // Get scales int8 bias key + const std::string key_bs = this->key_ + "@bs"; + + // Scales for int8 bias are to be cached to avoid + // computing them each iteration + groups = std::max(groups, 1); + auto bias_scale_tuple = + std::static_pointer_cast>>( + this->dev_ctx_.GetBlob(key_bs)); + if (bias_scale_tuple) return bias_scale_tuple; + + const auto& weights_tz = phi::vectorize(filter->dims()); + + const auto& scale_in_data = + this->dev_ctx_.HasDnnAttr("Scale_in") + ? PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr("Scale_in")) + : 1.0f; + + bool is_multi_channel = scale_weights_data.size() > 1; + int mask_reorder = is_multi_channel ? 
1 << 0 : 1; + + int count = 1; + if (is_multi_channel) { + count *= weights_tz[0]; + if (groups > 1) { + count *= weights_tz[1]; + } + } + + bias_scale_tuple = + std::make_shared>>(std::make_tuple( + static_cast(mask_reorder), std::vector(count))); + for (int i = 0; i < count; i++) { + std::get<1>(*bias_scale_tuple)[i] = scale_in_data * scale_weights_data[i]; + } + + this->dev_ctx_.SetBlob(key_bs, bias_scale_tuple); + + return bias_scale_tuple; + } + + std::tuple, float> get_int8_scales( + const DenseTensor* filter, + int groups, + bool force_fp32_output, + bool fuse_residual_conn, + const std::string& fuse_activation) const { + const auto& weights_tz = phi::vectorize(filter->dims()); + groups = std::max(groups, 1); + + const auto& scale_weights_data = + this->dev_ctx_.HasDnnAttr("Scale_weights") + ? PADDLE_GET_CONST(std::vector, + this->dev_ctx_.GetDnnAttr("Scale_weights")) + : std::vector{1.0f}; + const auto& scale_in_data = + this->dev_ctx_.HasDnnAttr("Scale_in") + ? PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr("Scale_in")) + : 1.0f; + const auto& scale_in_eltwise_data = + this->dev_ctx_.HasDnnAttr("Scale_in_eltwise") + ? PADDLE_GET_CONST(float, + this->dev_ctx_.GetDnnAttr("Scale_in_eltwise")) + : 1.0f; + + bool is_multi_channel = scale_weights_data.size() > 1; + bool has_activation = !fuse_activation.empty(); + const auto& scale_out = + this->dev_ctx_.HasDnnAttr("Scale_out") + ? PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr("Scale_out")) + : 1.0f; + float activation_scale = + (!force_fp32_output && has_activation) ? scale_out : 1.0f; + + float scale_out_data = + (force_fp32_output || has_activation) ? 1.0f : scale_out; + float sum_scale = + fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; + int count = + is_multi_channel + ? (groups > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector output_shift_scale(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + // weights data will contain 0 in some models, then weights + // scale couldn't be calculated + output_shift_scale[i] = scale_out_data; + else + output_shift_scale[i] = + static_cast(static_cast(scale_out_data) / + (static_cast(scale_in_data) * + static_cast(scale_weights_data[i]))); + } + + return std::make_tuple(sum_scale, output_shift_scale, activation_scale); + } + + dnnl::primitive_attr CreateConvAttrs(const DenseTensor* filter, + int groups, + bool force_fp32_output, + bool fuse_residual_conn, + const std::string& fuse_activation) { + dnnl::primitive_attr conv_attr; + dnnl::post_ops post_operations; + + float sum_scale = 1.0f; + float activation_scale = 1.0f; + std::vector output_shift_scale; + if (funcs::is_int8()) { + if (this->dev_ctx_.HasDnnAttr("Sum_scale")) { + sum_scale = + PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr("Sum_scale")); + activation_scale = + this->dev_ctx_.HasDnnAttr("Activation_scale") + ? PADDLE_GET_CONST( + float, this->dev_ctx_.GetDnnAttr("Activation_scale")) + : activation_scale; + output_shift_scale = + this->dev_ctx_.HasDnnAttr("Output_shift_scale") + ? PADDLE_GET_CONST( + std::vector, + this->dev_ctx_.GetDnnAttr("Output_shift_scale")) + : output_shift_scale; + } else { + std::tie(sum_scale, output_shift_scale, activation_scale) = + get_int8_scales(filter, + groups, + force_fp32_output, + fuse_residual_conn, + fuse_activation); + } + + if (output_shift_scale.size() > 0) { + int mask = output_shift_scale.size() > 1 ? 
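The int8 scale plumbing above is easier to audit with concrete numbers; all values below are assumed for illustration:

// Say scale_in = 63.5 (127 / max_abs_in with max_abs_in = 2.0),
// scale_weights = {254.0} and scale_out = 31.75. The s32 accumulator
// then carries x * w amplified by scale_in * scale_weights[0] = 16129, so
//   output_shift_scale[0] = scale_out / (scale_in * scale_weights[0])
//                         = 31.75 / 16129 ~ 1.97e-3
// and the bias is pre-scaled by the same 16129 so that it adds in
// accumulator units before the shift back to the output scale.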
1 << 1 : 0; + conv_attr.set_output_scales(mask, output_shift_scale); + } + } + + // Fusion with Elementwise layer relies on adding a sum post-operation with + // the scale parameter. It is assumed that when fuse_residual_connection is + // true, the output tensor contains the data coming from residual + // connection. The result of this post_op is: + // Output = scale * Output + Conv_Out. + if (fuse_residual_conn) { + post_operations.append_sum(sum_scale); + } + + funcs::AppendActivation(this->dev_ctx_, post_operations, activation_scale); + + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + + std::shared_ptr + AcquireWeightsMemoryWithReorderFromDataPrimitive( + const phi::DenseTensor* filter, const int groups, const bool is_conv3d) { + const K* filter_data = filter->data(); + auto weights_tz = phi::vectorize(filter->dims()); + funcs::GetGroupConvWeightsTz(weights_tz, groups); + + auto user_src_md = + funcs::OneDNNMemDesc(weights_tz, + funcs::OneDNNGetDataType(), + GetWeightsFormat(groups, is_conv3d)); + + return this->AcquireMemoryWithReorder(user_src_md, + this->bwd_pd_->weights_desc(), + funcs::to_void_cast(filter_data), + "@weights_mem_d_p", + false); + } + + std::shared_ptr AcquireSrcMemoryWithReorder( + const phi::DenseTensor* input) { + return this->AcquireMemoryWithReorderPrimitive(input, + "@src_mem_p_user", + "@src_mem_p_target", + "@src_mem_p", + this->fwd_pd_->src_desc()); + } + + std::shared_ptr AcquireSrcMemoryWithReorderFromWeightsPrimitive( + const phi::DenseTensor* input) { + return this->AcquireMemoryWithReorderPrimitive(input, + "@src_mem_w_p_user", + "@src_mem_w_p_target", + "@src_mem_w_p", + this->bwd_w_pd_->src_desc()); + } + + std::shared_ptr + AcquireDiffDstMemoryWithReorderFromWeightsPrimitive( + const phi::DenseTensor* out_grad) { + return this->AcquireMemoryWithReorderPrimitive( + out_grad, + "@diff_dst_mem_w_p_user", + "@diff_dst_mem_w_p_target", + "@diff_dst_mem_w_p", + this->bwd_w_pd_->diff_dst_desc()); + } + + std::shared_ptr + AcquireDiffDstMemoryWithReorderMemoryFromDataPrimitive( + const phi::DenseTensor* out_grad) { + return this->AcquireMemoryWithReorderPrimitive( + out_grad, + "@diff_dst_mem_p_user", + "@diff_dst_mem_p_target", + "@diff_dst_mem_p", + this->bwd_pd_->diff_dst_desc()); + } + + std::shared_ptr AcquireMemoryWithReorderPrimitive( + const phi::DenseTensor* in_mem, + const char* key_mem_user, + const char* key_mem_target, + const char* key_mem, + const dnnl::memory::desc& mem_md) { + const T* in_mem_data = in_mem->data(); + const std::string user_key_suffix{key_mem_user}; + auto user_mem_p = this->AcquireMemory(user_key_suffix); + + if (!user_mem_p) { + return this->AcquireMemoryWithReorder(in_mem->mem_desc(), + mem_md, + funcs::to_void_cast(in_mem_data), + key_mem); + } else { + const std::string target_key_suffix{key_mem_target}; + const auto target_mem_p = this->AcquireMemory(target_key_suffix); + user_mem_p->set_data_handle(funcs::to_void_cast(in_mem_data)); + if (user_mem_p != target_mem_p) { + this->AcquireReorder(user_mem_p, target_mem_p); + } + return target_mem_p; + } + } + + std::shared_ptr AcquireWeightsMemoryWithReorder( + const phi::DenseTensor* filter, + const int groups, + const bool is_conv3d, + const bool is_test, + const std::vector& scale_data = {1.0f}, + int mask = 0) { + // This is workaround to make execution faster, delete + // if statement after including md inside Tensor + auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target"); + if (is_test && weights_mem_p) { + return weights_mem_p; + } 
else if (is_test) { + const K* filter_data = filter->data(); + auto weights_tz = phi::vectorize(filter->dims()); + funcs::GetGroupConvWeightsTz(weights_tz, groups); + + auto user_src_md = + funcs::OneDNNMemDesc(weights_tz, + funcs::OneDNNGetDataType(), + GetWeightsFormat(groups, is_conv3d)); + + return this->AcquireMemoryWithReorder(user_src_md, + this->fwd_pd_->weights_desc(), + funcs::to_void_cast(filter_data), + "@weights_mem_p", + is_test, + {}, + scale_data, + mask); + } else { + const T* filter_data = filter->data(); + auto weights_tz = phi::vectorize(filter->dims()); + funcs::GetGroupConvWeightsTz(weights_tz, groups); + + auto user_src_md = + funcs::OneDNNMemDesc(weights_tz, + funcs::OneDNNGetDataType(), + GetWeightsFormat(groups, is_conv3d)); + + return this->AcquireMemoryWithReorder(user_src_md, + this->fwd_pd_->weights_desc(), + funcs::to_void_cast(filter_data), + "@weights_mem_p", + is_test, + {}, + scale_data, + mask); + } + } + + std::shared_ptr AcquireBiasMemoryWithReorder( + const phi::DenseTensor* bias, + const bool is_test, + const std::vector& scale_data = {1.0f}, + int mask = 0) { + auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target"); + if (is_test && bias_mem_p) { + return bias_mem_p; + } else { + // if K is int8 (weights are int8) then biases are int32 + using K_Bias = typename std:: + conditional::value, int32_t, K>::type; + if (std::is_same::value && + bias->dtype() != phi::DataType::INT32) { + LOG(ERROR) << "Bias should be of type int32 but is " << bias->dtype(); + } + const K_Bias* bias_data = bias->data(); + + return this->AcquireMemoryWithReorder( + bias->mem_desc(), + this->fwd_pd_->bias_desc(), + funcs::to_void_cast(bias_data), + "@bias_mem_p", + is_test, + {}, + scale_data, + mask); + } + } + + std::shared_ptr AcquireResidualMemory( + const phi::DenseTensor* residual_param) { + void* residual_data = + residual_param->dtype() == + paddle::experimental::CppTypeToDataType::Type() + ? funcs::to_void_cast(residual_param->data()) + : funcs::to_void_cast(residual_param->data()); + auto residual_mem_p = this->AcquireMemory("@user_residual_data_mem_p"); + if (residual_mem_p) { + residual_mem_p->set_data_handle(residual_data); + return residual_mem_p; + } else { + return this->AcquireMemoryFromPrimitive(residual_param->mem_desc(), + residual_data, + "@user_residual_data_mem_p"); + } + } + + std::shared_ptr AcquireDstMemoryWithResidual( + phi::DenseTensor* output, const phi::DenseTensor* residual_param) { + std::shared_ptr dst_memory_p; + if (residual_param->mem_desc() != this->fwd_pd_->dst_desc()) { + auto residual_memory_p = this->AcquireResidualMemory(residual_param); + dst_memory_p = this->template AcquireDstMemory(output); + this->AcquireReorder(residual_memory_p, dst_memory_p); + } else { + // Changing ShareDataWith to TensorCopy results in performance drop + // on ResNet architectures + // (https://github.com/PaddlePaddle/Paddle/issues/22964) + output->ShareDataWith(*residual_param); + dst_memory_p = this->template AcquireDstMemory(output); + } + return dst_memory_p; + } +}; + +} // namespace onednn +} // namespace phi diff --git a/paddle/phi/kernels/onednn/conv_kernel.cc b/paddle/phi/kernels/onednn/conv_kernel.cc new file mode 100644 index 0000000000000..8dc8a9a66bdbe --- /dev/null +++ b/paddle/phi/kernels/onednn/conv_kernel.cc @@ -0,0 +1,436 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/conv_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/data_layout_transform.h" +#include "paddle/phi/kernels/onednn/conv_handler.h" + +namespace phi { + +static dnnl::memory::data_type GetDstType( + bool is_int8, + bool is_bfloat16, + bool force_fp32_output, + std::string fuse_activation, + bool fuse_residual_conn, + const phi::DenseTensor* residual_param) { + auto dst_dt = dnnl::memory::data_type::f32; + if (is_int8) { + dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6") + ? dnnl::memory::data_type::u8 + : dnnl::memory::data_type::s8; + if (force_fp32_output) { + dst_dt = dnnl::memory::data_type::f32; + } + if (fuse_residual_conn && residual_param) { + auto residual_dt = funcs::ToOneDNNDataType(residual_param->dtype()); + if (dst_dt != residual_dt) dst_dt = residual_dt; + } + } else { + if (!force_fp32_output && is_bfloat16) { + dst_dt = dnnl::memory::data_type::bf16; + if (fuse_residual_conn && residual_param) { + dst_dt = funcs::ToOneDNNDataType(residual_param->dtype()); + } + } + } + return dst_dt; +} + +#define PD_VISIT_FLOAT_AND_INT8_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +template +void ComputeFP32(const OneDNNContext& dev_ctx, + const DenseTensor* input, + const DenseTensor* filter, + const DenseTensor* bias, + const DenseTensor* residual_param, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + bool is_test, + bool is_BFLOAT16, + const std::string& fuse_activation, + bool fuse_residual_conn, + bool force_fp32_output, + DenseTensor* output) { + const auto& onednn_engine = dev_ctx.GetEngine(); + const bool is_conv3d = strides.size() == 3U; + const std::string& unique_name = + dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0]; + PD_VISIT_FLOAT_AND_INT8_TYPES( + filter->dtype(), "ConvOneDNNHandlerT", ([&] { + onednn::ConvOneDNNHandlerT handler(dev_ctx, + onednn_engine, + dev_ctx.GetPlace(), + input, + filter, + bias, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + output, + unique_name); + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); + auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( + filter, groups, is_conv3d, is_test); + std::shared_ptr dst_memory_p; + if (fuse_residual_conn) { + dst_memory_p = + handler.AcquireDstMemoryWithResidual(output, residual_param); + } else { + dst_memory_p = handler.template AcquireDstMemory(output); + } + + auto conv_p = handler.AcquireForwardPrimitive(); + 
std::unordered_map args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + if (bias) { + auto bias_memory_p = + handler.AcquireBiasMemoryWithReorder(bias, is_test); + args.insert({DNNL_ARG_BIAS, *bias_memory_p}); + } + + auto& astream = OneDNNContext::tls().get_stream(); + conv_p->execute(astream, args); + astream.wait(); + output->set_mem_desc(dst_memory_p->get_desc()); + })); +} + +template +void ComputeINT8(const OneDNNContext& dev_ctx, + const DenseTensor* input, + const DenseTensor* filter, + const DenseTensor* bias, + const DenseTensor* residual_param, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + bool is_test, + bool is_BFLOAT16, + const std::string& fuse_activation, + bool fuse_residual_conn, + bool force_fp32_output, + DenseTensor* output) { + const auto& onednn_engine = dev_ctx.GetEngine(); + const bool is_conv3d = strides.size() == 3U; + + bool unsigned_output = + (fuse_activation == "relu" || fuse_activation == "relu6"); + bool need_s8_to_u8 = false; + + PADDLE_ENFORCE_NE( + is_conv3d, + true, + phi::errors::Unimplemented( + "OneDNN int8 convolution does not support 3D inputs currently")); + PADDLE_ENFORCE_EQ( + fuse_residual_conn && force_fp32_output, + false, + phi::errors::Unimplemented( + "residual fusion does not support force output with fp32")); + const std::string& unique_name = + dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0]; + PD_VISIT_FLOAT_AND_INT8_TYPES( + filter->dtype(), "ConvMKLDNNHandlerT", ([&] { + onednn::ConvOneDNNHandlerT handler(dev_ctx, + onednn_engine, + dev_ctx.GetPlace(), + input, + filter, + bias, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + output, + unique_name); + + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); + + const auto& scale_weights_data = + dev_ctx.HasDnnAttr("Scale_weights") + ? PADDLE_GET_CONST(std::vector, + dev_ctx.GetDnnAttr("Scale_weights")) + : std::vector{1.0f}; + const bool is_multi_channel = scale_weights_data.size() > 1; + int mask_reorder = is_multi_channel + ? ((groups != 1) ? 
(1 << 1) + (1 << 0) : 1 << 0) + : 0; + auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( + filter, groups, false, true, scale_weights_data, mask_reorder); + + std::shared_ptr dst_memory_p; + if (fuse_residual_conn) { + PADDLE_ENFORCE_EQ( + output->dims(), + residual_param->dims(), + phi::errors::InvalidArgument( + "Output and elementwise parameter need to have the " + "same dimension sizes, but got output's dimension = %d" + " and residual param's dimension =%d .", + output->dims().size(), + residual_param->dims().size())); + dst_memory_p = + handler.AcquireDstMemoryWithResidual(output, residual_param); + need_s8_to_u8 = (funcs::OneDNNGetDataType() == + dnnl::memory::data_type::s8) && + unsigned_output; + } else { + dst_memory_p = handler.template AcquireDstMemory(output); + } + + auto conv_p = handler.AcquireForwardPrimitive(); + + std::unordered_map args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + if (bias) { + std::vector bias_scales; + auto p_scales_tuple = + std::make_shared>>( + std::make_tuple(static_cast(mask_reorder), + bias_scales)); + if (dev_ctx.HasDnnAttr("Bias_scales")) { + bias_scales = PADDLE_GET_CONST(std::vector, + dev_ctx.GetDnnAttr("Bias_scales")); + p_scales_tuple = + std::make_shared>>( + std::make_tuple(static_cast(mask_reorder), + bias_scales)); + } else { + p_scales_tuple = handler.get_int8_bias_scales( + filter, groups, scale_weights_data); + } + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( + bias, + true, + std::get<1>(*p_scales_tuple), + std::get<0>(*p_scales_tuple)); + args.insert({DNNL_ARG_BIAS, *bias_memory_p}); + } + + auto& astream = OneDNNContext::tls().get_stream(); + conv_p->execute(astream, args); + astream.wait(); + + if (need_s8_to_u8) { + dev_ctx.Alloc(output); + } + + output->set_mem_desc(dst_memory_p->get_desc()); + })); +} + +template +void ConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + dev_ctx.GetPlace().GetType(), + AllocationType::CPU, + phi::errors::PreconditionNotMet("Operator DNNL Conv must use CPUPlace")); + bool is_INT8 = + std::is_same::value || std::is_same::value; + + bool is_test = dev_ctx.HasDnnAttr("is_test") + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test")) + : false; + bool is_BFLOAT16 = + dev_ctx.HasDnnAttr("mkldnn_data_type") + ? PADDLE_GET_CONST(std::string, + dev_ctx.GetDnnAttr("mkldnn_data_type")) == + "bfloat16" + : false; + const auto* bias = + dev_ctx.HasDnnInput("Bias") ? dev_ctx.GetDnnInput("Bias") : nullptr; + const auto* residual_param = dev_ctx.HasDnnInput("ResidualData") + ? dev_ctx.GetDnnInput("ResidualData") + : nullptr; + bool fuse_residual_conn = + dev_ctx.HasDnnAttr("fuse_residual_connection") + ? PADDLE_GET_CONST(bool, + dev_ctx.GetDnnAttr("fuse_residual_connection")) + : false; + const std::string& fuse_activation = + dev_ctx.HasDnnAttr("fuse_activation") + ? PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation")) + : ""; + bool force_fp32_output = + dev_ctx.HasDnnAttr("force_fp32_output") + ? 
PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output")) + : false; + auto dst_dt = GetDstType(is_INT8, + is_BFLOAT16, + force_fp32_output, + fuse_activation, + fuse_residual_conn, + residual_param); + if (!is_INT8) { + if (dst_dt == dnnl::memory::data_type::f32) { + ComputeFP32(dev_ctx, + &input, + &filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + out); + } else if (dst_dt == dnnl::memory::data_type::bf16) { + ComputeFP32(dev_ctx, + &input, + &filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + out); + } + } else { + if (dst_dt == dnnl::memory::data_type::f32) { + ComputeINT8(dev_ctx, + &input, + &filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + out); + } else if (dst_dt == dnnl::memory::data_type::u8) { + ComputeINT8(dev_ctx, + &input, + &filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + out); + } else if (dst_dt == dnnl::memory::data_type::s8) { + ComputeINT8(dev_ctx, + &input, + &filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + out); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d, + OneDNN, + ONEDNN, + phi::ConvKernel, + float, + phi::dtype::bfloat16, + uint8_t, + int8_t) {} diff --git a/paddle/phi/kernels/xpu/conv_grad_kernel.cc b/paddle/phi/kernels/xpu/conv_grad_kernel.cc index da86474b96b65..ad97d86e916fa 100644 --- a/paddle/phi/kernels/xpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_grad_kernel.cc @@ -28,12 +28,9 @@ void ConvGradKernel(const Context& dev_ctx, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* input_grad, DenseTensor* filter_grad) { using XPUT = typename XPUTypeTrait::Type; @@ -151,7 +148,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -167,13 +164,10 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, out_grad, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, input_grad, filter_grad); } diff --git a/paddle/phi/kernels/xpu/conv_kernel.cc b/paddle/phi/kernels/xpu/conv_kernel.cc index 287faf4cdb9aa..05f5f939187c4 100644 --- a/paddle/phi/kernels/xpu/conv_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_kernel.cc @@ -27,12 +27,9 @@ void ConvKernel(const Context& dev_ctx, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& 
dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* out) { using XPUT = typename XPUTypeTrait::Type; std::vector paddings = paddings_t; @@ -117,7 +114,7 @@ void DepthwiseConvKernel(const Context& dev_ctx, const DenseTensor& filter, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -131,13 +128,10 @@ void DepthwiseConvKernel(const Context& dev_ctx, filter, strides, paddings, - paddding_algorithm, - groups, + padding_algorithm, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, out); } diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc index 617c6e289bf2b..22ff9b3e1a834 100644 --- a/paddle/phi/ops/compat/conv2d_sig.cc +++ b/paddle/phi/ops/compat/conv2d_sig.cc @@ -17,31 +17,15 @@ namespace phi { KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (!ctx.HasAttr("use_addto") || !ctx.HasAttr("workspace_size_MB") || - !ctx.HasAttr("exhaustive_search")) { - return KernelSignature("conv2d_infer", - {"Input", "Filter"}, - {"strides", - "paddings", - "padding_algorithm", - "groups", - "dilations", - "data_format"}, - {"Output"}); - } else { - return KernelSignature("conv2d", - {"Input", "Filter"}, - {"strides", - "paddings", - "padding_algorithm", - "groups", - "dilations", - "data_format", - "use_addto", - "workspace_size_MB", - "exhaustive_search"}, - {"Output"}); - } + return KernelSignature("conv2d", + {"Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "dilations", + "groups", + "data_format"}, + {"Output"}); } KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { @@ -50,12 +34,9 @@ KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { {"strides", "paddings", "padding_algorithm", - "groups", "dilations", - "data_format", - "use_addto", - "workspace_size_MB", - "exhaustive_search"}, + "groups", + "data_format"}, {"Input@GRAD", "Filter@GRAD"}); } @@ -66,12 +47,9 @@ KernelSignature Conv2dDoubleGradOpArgumentMapping( {"strides", "paddings", "padding_algorithm", - "groups", "dilations", - "data_format", - "use_addto", - "workspace_size_MB", - "exhaustive_search"}, + "groups", + "data_format"}, {"DInput", "DFilter", "DDOutput"}); } diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index f93a031f7bc13..3231b18c8886e 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -289,12 +289,9 @@ def forward(self, input): self._stride, self._padding, "EXPLICIT", - self._groups if self._groups else 1, self._dilation, + self._groups if self._groups else 1, "NCHW", - False, - -1, - False, ) if self.bias is not None: pre_act = F.elementwise_add(pre_bias, self.bias, axis=1) diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 573594216177e..f022f5a8e38a3 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -137,12 +137,9 @@ def _conv_nd( stride, padding, padding_algorithm, - groups, dilation, + groups, data_format, - False, - -1, - False, ) if bias is not None: channel_dim = ( @@ -486,21 +483,33 @@ def conv1d( x = unsqueeze(x, axis=[squeeze_aixs]) if in_dygraph_mode(): - out = getattr(_C_ops, l_type)( - x, - weight, - stride, - padding, - 
padding_algorithm,
-            groups,
-            dilation,
-            conv2d_data_format,
-            False,
-            -1,
-            False,
-            False,
-            use_cudnn,
-        )
+        if l_type == 'conv2d':
+            out = _C_ops.conv2d(
+                x,
+                weight,
+                stride,
+                padding,
+                padding_algorithm,
+                dilation,
+                groups,
+                conv2d_data_format,
+            )
+        else:
+            out = getattr(_C_ops, l_type)(
+                x,
+                weight,
+                stride,
+                padding,
+                padding_algorithm,
+                groups,
+                dilation,
+                conv2d_data_format,
+                False,
+                -1,
+                False,
+                False,
+                use_cudnn,
+            )
         if bias is not None:
             out = nn.elementwise_add(out, bias, axis=channel_dim)
     elif _in_legacy_dygraph():
@@ -746,12 +755,9 @@
             stride,
             padding,
             padding_algorithm,
-            groups,
             dilation,
+            groups,
             data_format,
-            False,
-            -1,
-            False,
         )
         if bias is not None:
             out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)

From c5d9913862e11814f9c59d24babcc98aeae61736 Mon Sep 17 00:00:00 2001
From: zyfncg
Date: Tue, 1 Nov 2022 11:06:17 +0800
Subject: [PATCH 49/91] Generate static graph code for some activation ops by
 Yaml (part2) (#47440)

* generate static graph code for ceil, expm1 ops

* generate static graph code for some activation ops

* fix bug

* revert doc of silu and logsigmoid
---
 paddle/fluid/operators/activation_op.cc  | 277 --------------------
 paddle/phi/api/yaml/backward.yaml        | 142 +++++++++++
 paddle/phi/api/yaml/legacy_backward.yaml | 143 -----------
 paddle/phi/api/yaml/legacy_ops.yaml      | 125 ---------
 paddle/phi/api/yaml/op_compat.yaml       |  62 +++++
 paddle/phi/api/yaml/ops.yaml             | 125 +++++++++
 paddle/phi/ops/compat/activation_sig.cc  |  36 ---
 python/paddle/tensor/ops.py              | 308 +++++++++++++++--------
 8 files changed, 537 insertions(+), 681 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index a444812ed99f8..6a239da553a58 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -146,26 +146,6 @@ Sigmoid Activation
 
 )DOC";
 
-UNUSED constexpr char SiluDoc[] = R"DOC(
-Silu Activation Operator
-
-$$out = x * \\frac{1}{1 + e^{-x}}$$
-)DOC";
-
-UNUSED constexpr char LogSigmoidDoc[] = R"DOC(
-Logsigmoid Activation Operator
-
-$$out = \\log \\frac{1}{1 + e^{-x}}$$
-
-)DOC";
-
-UNUSED constexpr char Expm1Doc[] = R"DOC(
-Expm1 Operator. Computes expm1 of x element-wise with a natural number :math:`e` as the base.
-
-$$out = e^x - 1$$
-
-)DOC";
-
 UNUSED constexpr char ReluDoc[] = R"DOC(
 Relu Activation Operator.
 
@@ -206,43 +186,6 @@ Please make sure input is legal in case of numeric errors.
 
 )DOC";
 
-UNUSED constexpr char CeilDoc[] = R"DOC(
-Ceil Operator. Computes ceil of x element-wise.
-
-.. math::
-  out = \left \lceil x \right \rceil
-
-)DOC";
-
-UNUSED constexpr char FloorDoc[] = R"DOC(
-Floor Activation Operator. Computes floor of x element-wise.
-
-$$out = \\lfloor x \\rfloor$$
-
-)DOC";
-
-UNUSED constexpr char RoundDoc[] = R"DOC(
-The OP rounds the values in the input to the nearest integer value.
-
-.. code-block:: text
-
-  input:
-    x.shape = [4]
-    x.data = [1.2, -0.9, 3.4, 0.9]
-
-  output:
-    out.shape = [4]
-    out.data = [1., -1., 3., 1.]
-
-)DOC";
-
-UNUSED constexpr char ReciprocalDoc[] = R"DOC(
-Reciprocal Activation Operator.
-
-$$out = \\frac{1}{x}$$
-
-)DOC";
-
 UNUSED constexpr char LogDoc[] = R"DOC(
 Log Activation Operator.
 
@@ -252,33 +195,6 @@ Natural logarithm of x.
 
 )DOC";
 
-UNUSED constexpr char Log2Doc[] = R"DOC(
-Log2 Activation Operator.
-
-$$out = \log_2x$$
-
-logarithm of x base to 2.
-
-)DOC";
-
-UNUSED constexpr char Log10Doc[] = R"DOC(
-Log10 Activation Operator.
-
-$$out = \log_10_x$$
-
-logarithm of x base to 10.
- -)DOC"; - -UNUSED constexpr char Log1pDoc[] = R"DOC( -Log Activation Operator. - -$out = \ln(x+1)$ - -Natural logarithm of x. - -)DOC"; - UNUSED constexpr char SquareDoc[] = R"DOC( The OP square each elements of the inputs. @@ -356,28 +272,6 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { } }; -class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "Input of HardShrink operator"); - AddOutput("Out", "Output of HardShrink operator"); - AddAttr("threshold", - "The value of threshold for HardShrink. [default: 0.5]") - .SetDefault(0.5f); - AddComment(R"DOC( -:strong:`HardShrink activation operator` - -.. math:: - out = \begin{cases} - x, \text{if } x > \lambda \\ - x, \text{if } x < -\lambda \\ - 0, \text{otherwise} - \end{cases} - -)DOC"); - } -}; - class BReluOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -454,39 +348,6 @@ class ELUGradOpMaker : public framework::SingleGradOpMaker { } }; -class LogitOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "Input of Logit operator"); - AddOutput("Out", "Output of Logit operator"); - AddAttr("eps", - "(float, default 1e-6f) the epsilon for input clamp bound") - .SetDefault(1e-6f); - AddComment(R"DOC( -Logit Operator. - -this function is defined as follow: -$ logit=ln\left ( {\frac {x} {1-x}} \right ) $ - -)DOC"); - } -}; - -template -class LogitGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("logit_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - class CELUOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -591,31 +452,6 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { } }; -class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "An N-D Tensor with data type float32, float64. "); - AddOutput("Out", "A Tensor with the same shape as input. "); - AddAttr("slope", - "The slope of the linear approximation of sigmoid. Its " - "value MUST BE positive. Default is 0.2. ") - .SetDefault(0.2f); - AddAttr( - "offset", - "The offset of the linear approximation of sigmoid. Default is 0.5. ") - .SetDefault(0.5f); - AddComment(R"DOC( -HardSigmoid Activation Operator. - -A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), -which is much faster than sigmoid. - -$$out = \max(0, \min(1, slope * x + offset))$$ - -)DOC"); - } -}; - class SwishOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -684,22 +520,12 @@ It is recommended to use the defaults for this activation. 
}; REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); -REGISTER_ACTIVATION_OP_MAKER(Silu, SiluDoc); -REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); -REGISTER_ACTIVATION_OP_MAKER(Expm1, Expm1Doc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc); REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc); REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc); REGISTER_ACTIVATION_OP_MAKER(Rsqrt, RsqrtDoc); -REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc); -REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc); -REGISTER_ACTIVATION_OP_MAKER(Round, RoundDoc); -REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc); REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); -REGISTER_ACTIVATION_OP_MAKER(Log2, Log2Doc); -REGISTER_ACTIVATION_OP_MAKER(Log10, Log10Doc); -REGISTER_ACTIVATION_OP_MAKER(Log1p, Log1pDoc); REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc); REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); @@ -1093,73 +919,6 @@ DECLARE_INPLACE_OP_INFERER(ActivationDoubleGradOpInplaceInferer, DECLARE_INPLACE_OP_INFERER(ActivationTripleGradOpInplaceInferer, {"DDX", "D_DOut"}); -class LogitOp : public framework::OperatorWithKernel { - public: - LogitOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::InvalidArgument( - "Input(%s) of LogitOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), - true, - platform::errors::InvalidArgument( - "Output(%s) of LogitOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - framework::LibraryType library{framework::LibraryType::kPlain}; - phi::DataLayout layout = phi::DataLayout::kAnyLayout; - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - - return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); - } -}; - -class LogitGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Input(%s) of LogitGradOp should not be null.", "DOut")); - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), - true, - platform::errors::InvalidArgument( - "Input(%s) of LogitGradOp should not be null.", "X")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput(framework::GradVarName("X")), - true, - platform::errors::InvalidArgument( - "Output(%s) of LogitGradOp should not be null.", "DX")); - auto x_grad_name = framework::GradVarName("X"); - ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ x_grad_name); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - framework::LibraryType library{framework::LibraryType::kPlain}; - phi::DataLayout layout = phi::DataLayout::kAnyLayout; - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); - } -}; - template class PowGradOpMaker : public framework::SingleGradOpMaker { public: @@ 
-1273,10 +1032,6 @@ REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); REGISTER_ACTIVATION_OP(relu6, Relu6, Relu6Functor, Relu6GradFunctor); -REGISTER_ACTIVATION_OP(hard_shrink, - HardShrink, - HardShrinkFunctor, - HardShrinkGradFunctor); REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor, @@ -1285,42 +1040,21 @@ REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); -REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor); REGISTER_ACTIVATION_OP(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); -REGISTER_ACTIVATION_OP(hard_sigmoid, - HardSigmoid, - HardSigmoidFunctor, - HardSigmoidGradFunctor); -REGISTER_ACTIVATION_OP(logsigmoid, - LogSigmoid, - LogSigmoidFunctor, - LogSigmoidGradFunctor); -REGISTER_ACTIVATION_OP(expm1, Expm1, Expm1Functor, Expm1GradFunctor); REGISTER_ACTIVATION_OP(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); REGISTER_ACTIVATION_OP(mish, Mish, MishFunctor, MishGradFunctor); REGISTER_ACTIVATION_OP(stanh, STanh, STanhFunctor, STanhGradFunctor); -REGISTER_ACTIVATION_OP(reciprocal, - Reciprocal, - ReciprocalFunctor, - ReciprocalGradFunctor); - -REGISTER_ACTIVATION_OP(log2, Log2, Log2Functor, Log2GradFunctor); -REGISTER_ACTIVATION_OP(log10, Log10, Log10Functor, Log10GradFunctor); -REGISTER_ACTIVATION_OP(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); REGISTER_ACTIVATION_OP(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); REGISTER_ACTIVATION_OP(swish, Swish, SwishFunctor, SwishGradFunctor); -REGISTER_ACTIVATION_OP(round, Round, RoundFunctor, ZeroGradFunctor); -REGISTER_ACTIVATION_OP(floor, Floor, FloorFunctor, ZeroGradFunctor); -REGISTER_ACTIVATION_OP(ceil, Ceil, CeilFunctor, ZeroGradFunctor); /* ========================== sigmoid register ============================= */ @@ -1459,17 +1193,6 @@ REGISTER_OPERATOR( /* ========================================================================== */ -/* ======================== logit register ============================ - */ -REGISTER_OPERATOR(logit, - ops::LogitOp, - ops::LogitOpMaker, - ops::LogitGradOpMaker, - ops::LogitGradOpMaker); -REGISTER_OPERATOR(logit_grad, ops::LogitGradOp); - -/* ========================================================================== */ - /* ======================== celu register ============================ */ REGISTER_OPERATOR( diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index b13bd97a5a6a5..a3611bcca3477 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -106,6 +106,17 @@ func : bmm_grad data_type : out_grad +- backward_op : ceil_grad + forward : ceil(Tensor x) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [out_grad] + kernel : + func : ceil_grad + inplace : (out_grad -> x_grad) + - backward_op : cholesky_grad forward : cholesky (Tensor x, bool upper) -> Tensor(out) args : (Tensor out, Tensor out_grad, bool upper) @@ -257,6 +268,17 @@ func : exp_grad inplace : (out_grad -> x_grad) +- backward_op : expm1_grad + forward : expm1 (Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] + kernel : + func : expm1_grad + inplace : (out_grad -> x_grad) + - backward_op : fft_c2c_grad forward: fft_c2c(Tensor x, int64_t[] axes, str normalization, bool forward) -> Tensor(out) args : (Tensor out_grad, int64_t[] axes, str normalization, 
bool forward) @@ -295,6 +317,39 @@ output : Tensor(x_grad) invoke : flip(out_grad, axis) +- backward_op : floor_grad + forward : floor(Tensor x) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [out_grad] + kernel : + func : floor_grad + inplace : (out_grad -> x_grad) + +- backward_op : hardshrink_grad + forward : hardshrink (Tensor x, float threshold) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float threshold) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : hard_shrink_grad + inplace : (out_grad -> x_grad) + +- backward_op : hardsigmoid_grad + forward : hardsigmoid (Tensor x, float slope, float offset) -> Tensor(out) + args : (Tensor out, Tensor out_grad, float slope, float offset) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] + kernel : + func : hard_sigmoid_grad + inplace : (out_grad -> x_grad) + - backward_op : lgamma_grad forward : lgamma(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -305,6 +360,60 @@ kernel : func : lgamma_grad +- backward_op : log10_grad + forward : log10 (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : log10_grad + inplace : (out_grad -> x_grad) + +- backward_op : log1p_grad + forward : log1p (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : log1p_grad + inplace : (out_grad -> x_grad) + +- backward_op : log2_grad + forward : log2 (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : log2_grad + inplace : (out_grad -> x_grad) + +- backward_op : logit_grad + forward : logit (Tensor x, float eps = 1e-6f) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float eps) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : logit_grad + +- backward_op : logsigmoid_grad + forward : logsigmoid (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : logsigmoid_grad + inplace : (out_grad -> x_grad) + - backward_op : mv_grad forward : mv (Tensor x, Tensor vec) -> Tensor(out) args : (Tensor x, Tensor vec, Tensor out_grad) @@ -325,6 +434,28 @@ kernel : func : poisson_grad +- backward_op : reciprocal_grad + forward : reciprocal (Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] + kernel : + func : reciprocal_grad + inplace : (out_grad -> x_grad) + +- backward_op : round_grad + forward : round(Tensor x) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [out_grad] + kernel : + func : round_grad + inplace : (out_grad -> x_grad) + - backward_op : send_uv_grad forward : send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") -> Tensor(out) args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str message_op = "ADD") @@ -336,6 +467,17 @@ func : send_uv_grad data_type : x +- backward_op : silu_grad + forward : silu (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + 
output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : silu_grad + inplace : (out_grad -> x_grad) + - backward_op : sin_grad forward : sin (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 7d9da75c23209..922cb70d6e7e1 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -217,17 +217,6 @@ invoke : cast (out_grad, x.dtype()) no_need_buffer : x -- backward_op : ceil_grad - forward : ceil(Tensor x) -> Tensor(out) - args : (Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [out_grad] - kernel : - func : ceil_grad - inplace : (out_grad -> x_grad) - - backward_op : celu_double_grad forward : celu_grad(Tensor x, Tensor grad_out, float alpha) -> Tensor(grad_x) args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, float alpha) @@ -621,17 +610,6 @@ no_need_buffer : x backward : expand_double_grad -- backward_op : expm1_grad - forward : expm1 (Tensor x) -> Tensor(out) - args : (Tensor out, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [out] - kernel : - func : expm1_grad - inplace : (out_grad -> x_grad) - - backward_op : exponential__grad forward : exponential_ (Tensor x, float lam) -> Tensor(out) args : (Tensor out_grad) @@ -684,17 +662,6 @@ layout: out_grad inplace : (out_grad -> x_grad) -- backward_op : floor_grad - forward : floor(Tensor x) -> Tensor(out) - args : (Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [out_grad] - kernel : - func : floor_grad - inplace : (out_grad -> x_grad) - - backward_op : fmax_grad forward : fmax(Tensor x, Tensor y, int axis) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis) @@ -802,28 +769,6 @@ kernel : func : gumbel_softmax_grad -- backward_op : hardshrink_grad - forward : hardshrink (Tensor x, float threshold) -> Tensor(out) - args : (Tensor x, Tensor out_grad, float threshold) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : hard_shrink_grad - inplace : (out_grad -> x_grad) - -- backward_op : hardsigmoid_grad - forward : hardsigmoid (Tensor x, float slope, float offset) -> Tensor(out) - args : (Tensor out, Tensor out_grad, float slope, float offset) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [out] - kernel : - func : hard_sigmoid_grad - inplace : (out_grad -> x_grad) - - backward_op : hardswish_grad forward : hardswish (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0) -> Tensor(out) args : (Tensor x, Tensor out_grad, float threshold, float scale, float offset) @@ -1040,39 +985,6 @@ func : linear_interp_grad data_type : output_grad -- backward_op : log10_grad - forward : log10 (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : log10_grad - inplace : (out_grad -> x_grad) - -- backward_op : log1p_grad - forward : log1p (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : log1p_grad - inplace : (out_grad -> x_grad) - -- backward_op : log2_grad - forward : log2 (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - 
func : UnchangedInferMeta - param : [x] - kernel : - func : log2_grad - inplace : (out_grad -> x_grad) - - backward_op : log_double_grad forward : log_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) args : (Tensor x, Tensor grad_out, Tensor grad_x_grad) @@ -1126,27 +1038,6 @@ kernel : func : logcumsumexp_grad -- backward_op : logit_grad - forward : logit (Tensor x, float eps = 1e-6f) -> Tensor(out) - args : (Tensor x, Tensor out_grad, float eps) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : logit_grad - -- backward_op : logsigmoid_grad - forward : logsigmoid (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : logsigmoid_grad - inplace : (out_grad -> x_grad) - - backward_op : logsumexp_grad forward : logsumexp(Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] axis, bool keepdim, bool reduce_all) @@ -1625,17 +1516,6 @@ output : Tensor(x_grad) invoke : real_grad_impl(out_grad, x_grad) -- backward_op : reciprocal_grad - forward : reciprocal (Tensor x) -> Tensor(out) - args : (Tensor out, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [out] - kernel : - func : reciprocal_grad - inplace : (out_grad -> x_grad) - - backward_op : reduce_prod_grad forward : reduce_prod (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, IntArray dims, bool keep_dim, bool reduce_all) @@ -1803,17 +1683,6 @@ data_type : x no_need_buffer : x -- backward_op : round_grad - forward : round(Tensor x) -> Tensor(out) - args : (Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [out_grad] - kernel : - func : round_grad - inplace : (out_grad -> x_grad) - - backward_op : rsqrt_double_grad forward : rsqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) args : (Tensor out, Tensor grad_x, Tensor grad_x_grad) @@ -1964,17 +1833,6 @@ output : Tensor(x_grad) invoke : scale(out_grad, 0.0, 0.0, true) -- backward_op : silu_grad - forward : silu (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : silu_grad - inplace : (out_grad -> x_grad) - - backward_op : slice_double_grad forward : slice_grad (Tensor input, Tensor grad_out, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(grad_input) args : (Tensor grad_input_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) @@ -2068,7 +1926,6 @@ args : (Tensor[] out_grad, Scalar axis = -1) output : Tensor(x_grad) invoke : concat( out_grad, axis) -# TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future. 
- backward_op : sqrt_double_grad forward : sqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index e6aa3b18f5f86..c42bc74461e5a 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -357,16 +357,6 @@ data_type : x backward : cast_grad -- op : ceil - args : (Tensor x) - output : Tensor(out) - infer_meta : - func : UnchangedInferMeta - kernel : - func : ceil - inplace : (x -> out) - backward : ceil_grad - - op : celu args : (Tensor x, float alpha) output : Tensor(out) @@ -757,16 +747,6 @@ optional : y backward : expand_as_grad -- op : expm1 - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : expm1 - backward : expm1_grad - - op : exponential_ args : (Tensor x, float lam) output : Tensor(out) @@ -834,16 +814,6 @@ intermediate : xshape backward : flatten_grad -- op : floor - args : (Tensor x) - output : Tensor(out) - infer_meta : - func : UnchangedInferMeta - kernel : - func : floor - inplace : (x -> out) - backward : floor_grad - - op : floor_divide args : (Tensor x, Tensor y) output : Tensor(out) @@ -1046,26 +1016,6 @@ func : gumbel_softmax backward : gumbel_softmax_grad -- op : hardshrink - args : (Tensor x, float threshold) - output : Tensor - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : hard_shrink - backward : hardshrink_grad - -- op : hardsigmoid - args : (Tensor x, float slope, float offset) - output : Tensor - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : hard_sigmoid - backward : hardsigmoid_grad - - op : hardswish args : (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0) output : Tensor @@ -1359,33 +1309,6 @@ func : log backward: log_grad -- op : log10 - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : log10 - backward: log10_grad - -- op : log1p - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : log1p - backward: log1p_grad - -- op : log2 - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : log2 - backward: log2_grad - - op : log_loss args : (Tensor input, Tensor label, float epsilon) output : Tensor @@ -1445,25 +1368,6 @@ kernel : func : logical_xor -- op : logit - args : (Tensor x, float eps = 1e-6f) - output : Tensor - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : logit - backward : logit_grad - -- op : logsigmoid - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : logsigmoid - backward : logsigmoid_grad - - op : logsumexp args : (Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) output : Tensor(out) @@ -1989,16 +1893,6 @@ func : real backward : real_grad -- op : reciprocal - args : (Tensor x) - output : Tensor(out) - infer_meta : - func : UnchangedInferMeta - kernel : - func : reciprocal - inplace : (x -> out) - backward : reciprocal_grad - - op : reduce_prod args : (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) output : Tensor @@ -2130,16 +2024,6 @@ func : roll backward : roll_grad -- op : round - args : (Tensor x) - output : Tensor(out) - infer_meta : - func : UnchangedInferMeta - kernel : - func : round - inplace : (x -> out) - backward : round_grad - - op : rsqrt args : (Tensor x) output : Tensor(out) @@ -2295,15 +2179,6 @@ func : sign backward : sign_grad -- 
op : silu - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : silu - backward : silu_grad - - op : slice args : (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) output : Tensor diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 533a9d9dc040b..2857beccb10d2 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -120,6 +120,10 @@ - op : ceil backward : ceil_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] @@ -347,6 +351,10 @@ - op : expm1 backward : expm1_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] @@ -398,6 +406,10 @@ - op : floor backward : floor_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] @@ -457,6 +469,20 @@ extra : attrs : [bool use_mkldnn = false] +- op : hardshrink (hard_shrink) + backward : hardshrink_grad (hard_shrink_grad) + inputs : + x : X + outputs : + out : Out + +- op : hardsigmoid (hard_sigmoid) + backward : hardsigmoid_grad (hard_sigmoid_grad) + inputs : + x : X + outputs : + out : Out + - op : heaviside (elementwise_heaviside) backward : heaviside_grad (elementwise_heaviside_grad) extra : @@ -496,16 +522,28 @@ - op : log10 backward : log10_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] - op : log1p backward : log1p_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] - op : log2 backward : log2_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] @@ -514,6 +552,18 @@ extra : attrs : [bool use_mkldnn = false] +- op : logit + inputs : + x : X + outputs : + out : Out + +- op : logsigmoid + inputs : + x : X + outputs : + out : Out + - op : logsigmoid backward : logsigmoid_grad extra : @@ -620,6 +670,10 @@ - op : reciprocal backward : reciprocal_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] @@ -688,6 +742,10 @@ - op : round backward : round_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] @@ -728,6 +786,10 @@ - op : silu backward : silu_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 39bdde76ca2a3..5fd80df6864cf 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -96,6 +96,16 @@ func : bmm backward : bmm_grad +- op : ceil + args : (Tensor x) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + kernel : + func : ceil + inplace : (x -> out) + backward : ceil_grad + - op : cholesky args : (Tensor x, bool upper=false) output : Tensor @@ -226,6 +236,16 @@ inplace : (x -> out) backward : exp_grad +- op : expm1 + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : expm1 + backward : expm1_grad + - op : fft_c2c args : (Tensor x, int64_t[] axes, str normalization, bool forward) output : Tensor @@ -262,6 +282,36 @@ func : flip backward : flip_grad +- op : floor + args : (Tensor x) + output : Tensor(out) + infer_meta : + func : 
UnchangedInferMeta + kernel : + func : floor + inplace : (x -> out) + backward : floor_grad + +- op : hardshrink + args : (Tensor x, float threshold = 0.5) + output : Tensor (out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : hard_shrink + backward : hardshrink_grad + +- op : hardsigmoid + args : (Tensor x, float slope = 0.2, float offset = 0.5) + output : Tensor (out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : hard_sigmoid + backward : hardsigmoid_grad + - op : lgamma args : (Tensor x) output : Tensor(out) @@ -271,6 +321,52 @@ func : lgamma backward : lgamma_grad +- op : log10 + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : log10 + backward: log10_grad + +- op : log1p + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : log1p + backward: log1p_grad + +- op : log2 + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : log2 + backward: log2_grad + +- op : logit + args : (Tensor x, float eps = 1e-6f) + output : Tensor + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : logit + backward : logit_grad + +- op : logsigmoid + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : logsigmoid + backward : logsigmoid_grad + - op : mv args : (Tensor x, Tensor vec) output : Tensor @@ -289,6 +385,26 @@ func : poisson backward : poisson_grad +- op : reciprocal + args : (Tensor x) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + kernel : + func : reciprocal + inplace : (x -> out) + backward : reciprocal_grad + +- op : round + args : (Tensor x) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + kernel : + func : round + inplace : (x -> out) + backward : round_grad + - op : send_uv args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") output : Tensor(out) @@ -299,6 +415,15 @@ data_type : x backward : send_uv_grad +- op : silu + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : silu + backward : silu_grad + - op : sin args : (Tensor x) output : Tensor diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 990790e4798ed..85e8f7c2de721 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -47,16 +47,10 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu, "thresholded_relu", "threshold"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda"); -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Mish, "mish", "threshold"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Softsign, "softsign", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log, "log", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log2, "log2", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log10, "log10", ); // NOLINT -DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log1p, "log1p", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Celu, "celu", "alpha"); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardSwish, "hard_swish", @@ -75,15 +69,10 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Softplus, DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT 
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", ); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Expm1, "expm1", ); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Reciprocal, "reciprocal", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sqrt, "sqrt", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Rsqrt, "rsqrt", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu6, "relu6", "threshold"); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid, - "hard_sigmoid", - "slope" comma "offset"); // NOLINT KernelSignature SqrtActiOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("X")) { return KernelSignature("sqrt", {"X"}, {}, {"Out"}); @@ -100,10 +89,6 @@ KernelSignature SquareActiOpArgumentMapping(const ArgumentMappingContext& ctx) { } } -DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(Round, "round", ); // NOLINT -DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(Floor, "floor", ); // NOLINT -DEFINE_ACT_GRAD_NODEP_OP_ARGMAP(Ceil, "ceil", ); // NOLINT - KernelSignature ReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("relu_double_grad", {"Out", "DDX"}, {}, {"DDOut"}); @@ -151,10 +136,6 @@ KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"}); } -KernelSignature LogitGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("logit_grad", {"X", "Out@GRAD"}, {"eps"}, {"X@GRAD"}); -} - KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature( "elu_grad", {"X", "Out", "Out@GRAD"}, {"alpha"}, {"X@GRAD"}); @@ -233,10 +214,7 @@ PD_REGISTER_BASE_KERNEL_NAME(brelu_grad, hard_tanh_grad); PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(expm1_grad, phi::Expm1GradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(square_grad, phi::SquareGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(reciprocal_grad, - phi::ReciprocalGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(sqrt_grad, phi::SqrtGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(sqrt_grad_grad, phi::SqrtDoubleGradOpArgumentMapping); @@ -265,40 +243,26 @@ PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad, PD_REGISTER_ARG_MAPPING_FN(relu6_grad, phi::Relu6GradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(softshrink_grad, phi::SoftShrinkGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(hard_shrink_grad, - phi::HardShrinkGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tanh_shrink_grad, phi::TanhShrinkGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(softsign_grad, phi::SoftsignGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad, phi::SigmoidDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(sigmoid_triple_grad, phi::SigmoidTripleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad, - phi::LogSigmoidGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad, - phi::HardSigmoidGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logit_grad, phi::LogitGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(log_grad, phi::LogGradOpArgumentMapping); 
PD_REGISTER_ARG_MAPPING_FN(log_grad_grad, phi::LogDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log2_grad, phi::Log2GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log10_grad, phi::Log10GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log1p_grad, phi::Log1pGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(sqrt, phi::SqrtActiOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(square, phi::SquareActiOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(hard_swish_grad, phi::HardSwishGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(swish_grad, phi::SwishGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(round_grad, phi::RoundGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(floor_grad, phi::FloorGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(ceil_grad, phi::CeilGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(pow_grad, phi::PowGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(pow, phi::PowOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(celu_grad, phi::CeluGradOpArgumentMapping); diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py index 3aa63e2e09387..4a5994076cf89 100644 --- a/python/paddle/tensor/ops.py +++ b/python/paddle/tensor/ops.py @@ -39,14 +39,9 @@ ] __unary_func__ = [ - 'expm1', 'sqrt', 'rsqrt', 'abs', - 'ceil', - 'floor', - 'round', - 'reciprocal', 'square', ] @@ -119,15 +114,12 @@ r""" Examples: .. code-block:: python - import paddle import paddle.nn.functional as F - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) out = F.silu(x) print(out) # [ 0.7310586 1.7615942 2.8577224, 3.9280552 ] - """, ) @@ -136,31 +128,12 @@ r""" Examples: .. code-block:: python - import paddle import paddle.nn.functional as F - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.log_sigmoid(x) print(out) # [-0.91301525 -0.79813887 -0.64439666 -0.55435524] - -""", -) - -add_sample_code( - globals()["expm1"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.expm1(x) - print(out) - # [-0.32967997, -0.18126924, 0.10517092, 0.34985882] - """, ) @@ -245,70 +218,6 @@ """, ) -add_sample_code( - globals()["ceil"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.ceil(x) - print(out) - # [-0. -0. 1. 1.] - -""", -) - -add_sample_code( - globals()["floor"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.floor(x) - print(out) - # [-1. -1. 0. 0.] - -""", -) - -add_sample_code( - globals()["round"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.5, -0.2, 0.6, 1.5]) - out = paddle.round(x) - print(out) - # [-1. -0. 1. 2.] - -""", -) - -add_sample_code( - globals()["reciprocal"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.reciprocal(x) - print(out) - # [-2.5 -5. 10. 3.33333333] - -""", -) - add_sample_code( globals()["square"], r""" @@ -582,6 +491,44 @@ def atanh(x, name=None): return out +def ceil(x, name=None): + """ + + Ceil Operator. Computes ceil of x element-wise. + + .. math:: + out = \\left \\lceil x \\right \\rceil + + Args: + x (Tensor): Input of Ceil operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. 
Output of Ceil operator, a Tensor with shape same as input. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.ceil(x) + print(out) + # [-0. -0. 1. 1.] + + """ + if in_dygraph_mode(): + return _C_ops.ceil(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.ceil(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'ceil') + helper = LayerHelper('ceil', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='ceil', inputs={"X": x}, outputs={"Out": out}) + return out + + def cos(x, name=None): """ Cosine Operator. Computes cosine of x element-wise. @@ -627,18 +574,18 @@ def cosh(x, name=None): Input range `(-inf, inf)`, output range `(1, inf)`. - .. math:: - out = \frac{exp(x)+exp(-x)}{2} + .. math:: + out = \\frac{exp(x)+exp(-x)}{2} - Args: - x (Tensor): Input of Cosh operator, an N-D Tensor, with data type float32, float64 or float16. - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + Args: + x (Tensor): Input of Cosh operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - Returns: - Tensor. Output of Cosh operator, a Tensor with shape same as input. + Returns: + Tensor. Output of Cosh operator, a Tensor with shape same as input. - Examples: - .. code-block:: python + Examples: + .. code-block:: python import paddle @@ -711,6 +658,167 @@ def exp(x, name=None): return out +def expm1(x, name=None): + """ + + Expm1 Operator. Computes expm1 of x element-wise with a natural number :math:`e` as the base. + + .. math:: + out = e^x - 1 + + Args: + x (Tensor): Input of Expm1 operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. Output of Expm1 operator, a Tensor with shape same as input. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.expm1(x) + print(out) + # [-0.32967997, -0.18126924, 0.10517092, 0.34985882] + + """ + if in_dygraph_mode(): + return _C_ops.expm1(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.expm1(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'expm1') + helper = LayerHelper('expm1', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='expm1', inputs={"X": x}, outputs={"Out": out}) + return out + + +def floor(x, name=None): + """ + + Floor Activation Operator. Computes floor of x element-wise. + + .. math:: + out = \\lfloor x \\rfloor + + Args: + x (Tensor): Input of Floor operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. Output of Floor operator, a Tensor with shape same as input. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.floor(x) + print(out) + # [-1. -1. 0. 0.] 
+ + """ + if in_dygraph_mode(): + return _C_ops.floor(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.floor(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'floor') + helper = LayerHelper('floor', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='floor', inputs={"X": x}, outputs={"Out": out}) + return out + + +def reciprocal(x, name=None): + """ + + Reciprocal Activation Operator. + + .. math:: + out = \\frac{1}{x} + + Args: + x (Tensor): Input of Reciprocal operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. Output of Reciprocal operator, a Tensor with shape same as input. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.reciprocal(x) + print(out) + # [-2.5 -5. 10. 3.33333333] + + """ + if in_dygraph_mode(): + return _C_ops.reciprocal(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.reciprocal(x) + + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'reciprocal' + ) + helper = LayerHelper('reciprocal', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='reciprocal', inputs={"X": x}, outputs={"Out": out}) + return out + + +def round(x, name=None): + """ + + Round the values in the input to the nearest integer value. + + .. code-block:: text + + input: + x.shape = [4] + x.data = [1.2, -0.9, 3.4, 0.9] + + output: + out.shape = [4] + out.data = [1., -1., 3., 1.] + + Args: + x (Tensor): Input of Round operator, an N-D Tensor, with data type float32, float64 or float16. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. Output of Round operator, a Tensor with shape same as input. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.5, -0.2, 0.6, 1.5]) + out = paddle.round(x) + print(out) + # [-1. -0. 1. 2.] + + """ + if in_dygraph_mode(): + return _C_ops.round(x) + if _in_legacy_dygraph(): + return _legacy_C_ops.round(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'round') + helper = LayerHelper('round', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='round', inputs={"X": x}, outputs={"Out": out}) + return out + + def sin(x, name=None): """ Sine Activation Operator. 
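With that, the activation-op migration in this commit is complete. As a quick sanity check (not part of the patch), the migrated unary ops can be exercised directly; the expected values below are copied from the docstrings added in ops.py:

```python
import paddle

x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
print(paddle.ceil(x))        # [-0., -0., 1., 1.]
print(paddle.floor(x))       # [-1., -1., 0., 0.]
print(paddle.expm1(x))       # [-0.32967997, -0.18126924, 0.10517092, 0.34985882]
print(paddle.reciprocal(x))  # [-2.5, -5., 10., 3.33333333]

y = paddle.to_tensor([-0.5, -0.2, 0.6, 1.5])
print(paddle.round(y))       # [-1., -0., 1., 2.]
```

Each function follows the same dispatch order: `_C_ops` in the new dygraph mode, `_legacy_C_ops` in legacy dygraph, and an `append_op` through `LayerHelper` for static graphs.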
From 96f36962ef285bf563ed288fec44451ed6ff5724 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Tue, 1 Nov 2022 11:21:10 +0800 Subject: [PATCH 50/91] remove unused-local-typedefs warning on linux (#47513) --- cmake/flags.cmake | 1 - paddle/fluid/operators/collective/c_comm_init_op.cc | 1 - .../fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h | 1 - paddle/phi/kernels/xpu/rnn_kernel.cc | 1 - 4 files changed, 4 deletions(-) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 6a0e1704bfa5d..39261a788bd18 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -149,7 +149,6 @@ if(NOT WIN32) -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix - -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 -Wno-error=terminate # Warning in PADDLE_ENFORCE -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index d3d7f8423053d..59fbd02b5c086 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -54,7 +54,6 @@ class CCommInitOp : public framework::OperatorBase { // TODO(wangxi): Put this in the unified header file #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) using UniqueId = ncclUniqueId; - using Place = platform::CUDAPlace; using CommContext = platform::NCCLCommContext; #elif defined(PADDLE_WITH_XPU_BKCL) using UniqueId = BKCLUniqueId; diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index e1b829b03a4d4..7ab53437f8f4a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -143,7 +143,6 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { ElemwiseGradKernel::Compute(ctx); - using Tensor = phi::DenseTensor; auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/phi/kernels/xpu/rnn_kernel.cc b/paddle/phi/kernels/xpu/rnn_kernel.cc index 9465839043a6e..faa4dce29f77d 100644 --- a/paddle/phi/kernels/xpu/rnn_kernel.cc +++ b/paddle/phi/kernels/xpu/rnn_kernel.cc @@ -39,7 +39,6 @@ void RnnKernel(const Context& dev_ctx, DenseTensor* dropout_state, std::vector state, DenseTensor* reserve) { - using XPUTyp = typename XPUTypeTrait::Type; if (dropout_state->IsInitialized()) { if (dropout_state->numel() != out->numel()) dropout_state->clear(); } From bfc45e34b306ef5581c63ed69352940c4fff4e4e Mon Sep 17 00:00:00 2001 From: Vigi Zhang Date: Tue, 1 Nov 2022 11:45:44 +0800 Subject: [PATCH 51/91] add pdsa-2022-002 (#47486) --- security/README.md | 1 + security/README_cn.md | 1 + security/advisory/pdsa-2022-002.md | 33 +++++++++++++++++++++++++++ security/advisory/pdsa-2022-002_cn.md | 33 +++++++++++++++++++++++++++ 4 files changed, 68 insertions(+) create mode 100644 security/advisory/pdsa-2022-002.md create mode 100644 security/advisory/pdsa-2022-002_cn.md diff --git a/security/README.md b/security/README.md index cb01299927f77..eefde5344eb93 100644 --- a/security/README.md +++ b/security/README.md @@ -10,3 +10,4 @@ We regularly publish security advisories about using PaddlePaddle. 
 | Advisory Number | Type | Versions affected | Reported by | Additional Information |
 |----------------------------------------------|-------------------------|:-----------------:|---------------------------------------|------------------------|
 | [PDSA-2022-001](./advisory/pdsa-2022-001.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | |
+| [PDSA-2022-002](./advisory/pdsa-2022-002.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | |
diff --git a/security/README_cn.md b/security/README_cn.md
index a91f5ce459681..1beba5c1fa729 100644
--- a/security/README_cn.md
+++ b/security/README_cn.md
@@ -10,3 +10,4 @@
 | 安全公告编号 | 类型 | 受影响版本 | 报告者 | 备注 |
 |-------------------------------------------------|-------------------------|:-----:|---------------------------------------|-----|
 | [PDSA-2022-001](./advisory/pdsa-2022-001_cn.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | |
+| [PDSA-2022-002](./advisory/pdsa-2022-002_cn.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | |
diff --git a/security/advisory/pdsa-2022-002.md b/security/advisory/pdsa-2022-002.md
new file mode 100644
index 0000000000000..efb8e931722bb
--- /dev/null
+++ b/security/advisory/pdsa-2022-002.md
@@ -0,0 +1,33 @@
+## PDSA-2022-002: Code injection in paddle.audio.functional.get_window
+
+### Impact
+
+`paddle.audio.functional.get_window` is vulnerable to code injection as it calls `eval` on the user-supplied `winstr`. This may lead to arbitrary code execution.
+
+```python
+def get_window(
+    window: Union[str, Tuple[str, float]],
+    win_length: int,
+    fftbins: bool = True,
+    dtype: str = 'float64',
+) -> Tensor:
+    ...
+    try:
+        winfunc = eval('_' + winstr)
+    except NameError as e:
+        raise ValueError("Unknown window type.") from e
+```
+
+### Patches
+
+We have patched the issue in commit [26c419ca386aeae3c461faf2b828d00b48e908eb](https://github.com/PaddlePaddle/Paddle/commit/26c419ca386aeae3c461faf2b828d00b48e908eb).
+
+The fix will be included in PaddlePaddle 2.4.
+
+### For more information
+
+Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions.
+
+### Attribution
+
+This vulnerability has been reported by Tong Liu of ShanghaiTech University.
diff --git a/security/advisory/pdsa-2022-002_cn.md b/security/advisory/pdsa-2022-002_cn.md
new file mode 100644
index 0000000000000..84fc365fbbcd8
--- /dev/null
+++ b/security/advisory/pdsa-2022-002_cn.md
@@ -0,0 +1,33 @@
+## PDSA-2022-002: Code injection in paddle.audio.functional.get_window
+
+### 影响
+
+`paddle.audio.functional.get_window`由于使用`eval`用户提供的参数`winstr`而存在代码注入漏洞，将导致任意代码执行。
+
+```python
+def get_window(
+    window: Union[str, Tuple[str, float]],
+    win_length: int,
+    fftbins: bool = True,
+    dtype: str = 'float64',
+) -> Tensor:
+    ...
+ try: + winfunc = eval('_' + winstr) + except NameError as e: + raise ValueError("Unknown window type.") from e +``` + +### 补丁 + +我们在commit [26c419ca386aeae3c461faf2b828d00b48e908eb](https://github.com/PaddlePaddle/Paddle/commit/26c419ca386aeae3c461faf2b828d00b48e908eb)中对此问题进行了补丁。 + +修复将包含在飞桨2.4版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of ShanghaiTech University 提交。 From 974f8f321b4dff97073944e0b5aab6118ceca610 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Tue, 1 Nov 2022 12:59:55 +0800 Subject: [PATCH 52/91] fix:add no support for cuda_arch<700 (#47509) --- paddle/fluid/operators/math/bert_encoder_functor.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 18e5ee845d26e..5b11eee61a0fd 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -836,7 +836,13 @@ inline void MatMulWithHeadQK(const phi::GPUContext &context, FINAL_MASK); } else { if (bias_is_mask) { -#ifndef __HIPCC__ +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) + PADDLE_ENFORCE_EQ(bias_is_mask, + false, + platform::errors::InvalidArgument( + "QK_bias is mask can't be supported on rocm or " + "cuda_arch<700")); +#else constexpr int ITEMS_PER_THREAD = 1; bool is_half2 = true; @@ -853,11 +859,6 @@ inline void MatMulWithHeadQK(const phi::GPUContext &context, batch_size, head_num, seq_len); -#else - PADDLE_ENFORCE_EQ(bias_is_mask, - false, - platform::errors::InvalidArgument( - "rocm can't support that QK_bias is mask")); #endif } else { SoftmaxKernelWithEltadd2<__half2><<>>( From e6a6b0fc088f03a518670e14b9e025fcdd39db41 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 1 Nov 2022 13:43:30 +0800 Subject: [PATCH 53/91] Fix set_attr modify underly type (#47500) --- python/paddle/fluid/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 37645ef4ab326..4d8e356d63d8c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -6100,7 +6100,7 @@ def _inference_optimize(self, prune_read_op=True): for j in range(block.op_size()): op = block.op(j) if op.has_attr('is_test'): - op._set_attr('is_test', True) + op._set_bool_attr('is_test', True) if op.type() == "batch_norm": # Remove the output ReserveSpace of batch_norm if exists. 
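Stepping back to PDSA-2022-002 above: the root cause is that `eval` turns a user-controlled window name into executable code. A minimal sketch of the dict-dispatch alternative, in the spirit of the referenced fix — `_WINDOW_FUNCS`, `_hann`, and `get_window_safe` are illustrative names, not the actual patched code:

```python
import math
from typing import Callable, Dict, List

def _hann(n: int) -> List[float]:
    # Toy stand-in for a real window implementation.
    return [0.5 - 0.5 * math.cos(2 * math.pi * i / max(n - 1, 1)) for i in range(n)]

# Explicit registry: unknown names fail closed instead of reaching eval().
_WINDOW_FUNCS: Dict[str, Callable[[int], List[float]]] = {"hann": _hann}

def get_window_safe(winstr: str, win_length: int) -> List[float]:
    try:
        winfunc = _WINDOW_FUNCS[winstr]
    except KeyError as e:
        raise ValueError("Unknown window type.") from e
    return winfunc(win_length)

print(get_window_safe("hann", 4))
# get_window_safe("__import__('os').system('id')", 4) raises ValueError;
# no attacker-controlled string is ever evaluated as code.
```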
op.remove_output("ReserveSpace") From fece00d0d32451516b29793164ad8555241367b0 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 1 Nov 2022 13:50:30 +0800 Subject: [PATCH 54/91] [Tools]Add autoflake pre-commit hook to remove unused-imports/var (#47455) * [Tools]Add autoflake pre-commit hook to remove unused-imports/var * add more args test=document_fix --- .pre-commit-config.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 336617ee49cb3..b7574f2320c17 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -111,3 +111,14 @@ repos: hooks: - id: cmakelint args: [--config=./tools/codestyle/.cmakelintrc] + +- repo: https://github.com/PyCQA/autoflake + rev: v1.7.7 + hooks: + - id: autoflake + args: + - --in-place + - --remove-all-unused-imports + - --ignore-pass-after-docstring + - --ignore-init-module-imports + - --exclude=python/paddle/fluid/[!t]**,python/paddle/fluid/tra** From ad251cb51d47e28a711853b29d275e5ca8c9000d Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 1 Nov 2022 14:07:24 +0800 Subject: [PATCH 55/91] add missing scale parameter (#47519) --- python/paddle/distributed/fleet/utils/hybrid_parallel_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index c88a967035874..74ccd16656724 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -193,7 +193,7 @@ def fused_allreduce_gradients_with_group( else _apply_collective_grads ) with framework.no_grad(): - apply_func(parameter_list, group, bucket_size) + apply_func(parameter_list, group, bucket_size, scale) def fused_allreduce_gradients(parameter_list, hcg): From 8a1124b1cca12fc1e1447108bac24cff5356883c Mon Sep 17 00:00:00 2001 From: shentanyue <34421038+shentanyue@users.noreply.github.com> Date: Tue, 1 Nov 2022 14:17:06 +0800 Subject: [PATCH 56/91] [Lite][XPU] Upgrade lite subgraph api of xpu (#47373) --- paddle/fluid/inference/analysis/argument.h | 1 + paddle/fluid/inference/analysis/ir_pass_manager.cc | 2 ++ .../inference/analysis/ir_passes/lite_subgraph_pass.cc | 3 ++- paddle/fluid/inference/api/analysis_config.cc | 6 +++++- paddle/fluid/inference/api/analysis_predictor.cc | 1 + paddle/fluid/inference/api/paddle_analysis_config.h | 5 ++++- paddle/fluid/inference/capi_exp/pd_config.cc | 6 ++++-- paddle/fluid/inference/capi_exp/pd_config.h | 4 +++- paddle/fluid/inference/goapi/config.go | 9 +++++---- paddle/fluid/inference/lite/engine.cc | 1 + paddle/fluid/inference/lite/engine.h | 1 + paddle/fluid/pybind/inference_api.cc | 3 ++- 12 files changed, 31 insertions(+), 11 deletions(-) mode change 100755 => 100644 paddle/fluid/inference/analysis/argument.h diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h old mode 100755 new mode 100644 index d855dc999cab8..b2cdc1a369c36 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -288,6 +288,7 @@ struct Argument { DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string); DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool); DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int); + DECL_ARGUMENT_FIELD(xpu_enable_multi_stream, XpuEnableMultiStream, bool); DECL_ARGUMENT_FIELD(use_nnadapter, UseNNAdapter, bool); 
DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 97ca7c37c7f0e..4551d8dbf5225 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -263,6 +263,8 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("precision", new std::string(argument->xpu_precision())); pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen())); pass->Set("xpu_device_id", new int(argument->xpu_device_id())); + pass->Set("enable_multi_stream", + new bool(argument->xpu_enable_multi_stream())); // NNAdapter Related pass->Set("use_nnadapter", new bool(argument->use_nnadapter())); pass->Set("nnadapter_model_cache_dir", diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index 1c67923657029..577e3df2e68f8 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -258,6 +258,7 @@ void LiteSubgraphPass::SetUpEngine( std::string autotune_file = Get("autotune_file"); std::string precision = Get("precision"); bool adaptive_seqlen = Get("adaptive_seqlen"); + bool enable_multi_stream = Get("enable_multi_stream"); // NNAdapter Related bool use_nnadapter = Get("use_nnadapter"); std::string nnadapter_model_cache_dir = @@ -302,7 +303,6 @@ void LiteSubgraphPass::SetUpEngine( // input tensor of the Lite engine is located, and then affects // whether tensor sharing is feasible. paddle::lite_api::Place({target_type, precision_type}), - paddle::lite_api::Place({target_type, PRECISION(kInt64)}), paddle::lite_api::Place({target_type, PRECISION(kFloat)}), #ifdef PADDLE_WITH_ARM paddle::lite_api::Place({TARGET(kARM), precision_type}), @@ -321,6 +321,7 @@ void LiteSubgraphPass::SetUpEngine( config.autotune_file = autotune_file; config.precision = precision; config.adaptive_seqlen = adaptive_seqlen; + config.enable_multi_stream = enable_multi_stream; // NNAdapter Related config.nnadapter_model_cache_dir = nnadapter_model_cache_dir; config.nnadapter_device_names = nnadapter_device_names; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index be09976bc4d0e..8c9f02a4d37b3 100755 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -137,7 +137,8 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size, bool autotune, const std::string &autotune_file, const std::string &precision, - bool adaptive_seqlen) { + bool adaptive_seqlen, + bool enable_multi_stream) { use_xpu_ = true; xpu_l3_workspace_size_ = l3_workspace_size; xpu_locked_ = locked; @@ -145,6 +146,7 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size, xpu_autotune_file_ = autotune_file; xpu_precision_ = precision; xpu_adaptive_seqlen_ = adaptive_seqlen; + xpu_enable_multi_stream_ = enable_multi_stream; Update(); } @@ -439,6 +441,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(xpu_autotune_file_); CP_MEMBER(xpu_precision_); CP_MEMBER(xpu_adaptive_seqlen_); + CP_MEMBER(xpu_enable_multi_stream_); // NPU related. 
CP_MEMBER(use_npu_); @@ -1020,6 +1023,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << xpu_autotune_file_; ss << xpu_precision_; ss << xpu_adaptive_seqlen_; + ss << xpu_enable_multi_stream_; ss << use_npu_; ss << npu_device_id_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 8663ec7d1f09b..9197efc2a5edb 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1148,6 +1148,7 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetXpuPrecision(config_.xpu_precision_); argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_); argument_.SetXpuDeviceId(config_.xpu_device_id_); + argument_.SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_); // NNAdapter related argument_.SetUseNNAdapter(config_.NNAdapter().use_nnadapter); argument_.SetNNAdapterDeviceNames( diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 5bc50515bf40a..0ed5380e6755c 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -274,13 +274,15 @@ struct PD_INFER_DECL AnalysisConfig { /// file will be used and autotune will not be performed again. /// \param precision Calculation accuracy of multi_encoder /// \param adaptive_seqlen Is the input of multi_encoder variable length + /// \param enable_multi_stream Whether to enable the multi stream of xpu. /// void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false, bool autotune = true, const std::string& autotune_file = "", const std::string& precision = "int16", - bool adaptive_seqlen = false); + bool adaptive_seqlen = false, + bool enable_multi_stream = false); /// /// \brief configs of IPU @@ -1102,6 +1104,7 @@ struct PD_INFER_DECL AnalysisConfig { std::string xpu_autotune_file_; std::string xpu_precision_; bool xpu_adaptive_seqlen_; + bool xpu_enable_multi_stream_; // NNAdapter related LiteNNAdapterConfig nnadapter_config_; diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index b183ba8c63b25..6ff88beb70225 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -155,14 +155,16 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, PD_Bool autotune, const char* autotune_file, const char* precision, - PD_Bool adaptive_seqlen) { + PD_Bool adaptive_seqlen, + PD_Bool enable_multi_stream) { CHECK_AND_CONVERT_PD_CONFIG; config->EnableXpu(l3_workspace_size, locked, autotune, autotune_file, precision, - adaptive_seqlen); + adaptive_seqlen, + enable_multi_stream); } void PD_ConfigEnableNpu(__pd_keep PD_Config* pd_config, int32_t device_id) { diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index a7054d5390838..feb1d5724438a 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -200,6 +200,7 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableORTOptimization( /// file will be used and autotune will not be performed again. /// \param precision Calculation accuracy of multi_encoder /// \param adaptive_seqlen Is the input of multi_encoder variable length +/// \param enable_multi_stream Whether to enable the multi stream of xpu. 
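The new `enable_multi_stream` flag documented just above is plumbed uniformly through the C, Go, and Python surfaces that follow. As a usage sketch (not part of the patch), here is how it could be set through the Python inference API; the model paths are placeholders, and the `enable_xpu` spelling is assumed from Paddle's snake_case binding convention:

```python
import paddle.inference as paddle_infer

config = paddle_infer.Config("model.pdmodel", "model.pdiparams")  # placeholder paths
config.enable_xpu(
    0xFFFC00,  # l3_workspace_size
    False,     # locked
    True,      # autotune
    "",        # autotune_file
    "int16",   # precision
    False,     # adaptive_seqlen
    enable_multi_stream=True,
)
predictor = paddle_infer.create_predictor(config)
```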
/// PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( __pd_keep PD_Config* pd_config, @@ -208,7 +209,8 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( PD_Bool autotune, const char* autotune_file, const char* precision, - PD_Bool adaptive_seqlen); + PD_Bool adaptive_seqlen, + PD_Bool enable_multi_stream); /// /// \brief Turn on NPU. /// diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go index 0aca2a1075fd3..508ac63529560 100644 --- a/paddle/fluid/inference/goapi/config.go +++ b/paddle/fluid/inference/goapi/config.go @@ -199,8 +199,9 @@ func (config *Config) EnableORTOptimization() { /// \param autotune_file Specify the path of the autotune file. If autotune_file is specified, the algorithm specified in the file will be used and autotune will not be performed again. /// \param precision Calculation accuracy of multi_encoder /// \param adaptive_seqlen Is the input of multi_encoder variable length +/// \param enable_multi_stream Whether to enable the multi stream of xpu /// -func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune bool, autotuneFile string, precision string, adaptiveSeqlen bool) { +func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune bool, autotuneFile string, precision string, adaptiveSeqlen bool, enableMultiStream bool) { cAutotuneFile := C.CString(autotuneFile) cPrecision := C.CString(precision) defer func() { @@ -208,7 +209,7 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo C.free(unsafe.Pointer(cPrecision)) }() C.PD_ConfigEnableXpu(config.c, C.int32_t(l3WorkspaceSize), cvtGoBoolToPD(locked), cvtGoBoolToPD(autotune), - cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen)) + cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream)) } /// @@ -332,9 +333,9 @@ func (config *Config) IrOptim() bool { /// \param useCalibMode Use TRT int8 calibration(post training /// quantization). 
/// -func (config *Config) EnableTensorRtEngine(workspaceSize int32, maxBatchSize int32, minSubgraphSize int32, +func (config *Config) EnableTensorRtEngine(workspaceSize int64, maxBatchSize int32, minSubgraphSize int32, precision Precision, useStatic bool, useCalibMode bool) { - C.PD_ConfigEnableTensorRtEngine(config.c, C.int32_t(workspaceSize), C.int32_t(maxBatchSize), C.int32_t(minSubgraphSize), C.int32_t(precision), cvtGoBoolToPD(useStatic), cvtGoBoolToPD(useCalibMode)) + C.PD_ConfigEnableTensorRtEngine(config.c, C.int64_t(workspaceSize), C.int32_t(maxBatchSize), C.int32_t(minSubgraphSize), C.int32_t(precision), cvtGoBoolToPD(useStatic), cvtGoBoolToPD(useCalibMode)) } /// diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 300ff99602ccf..3a60077e9fa0b 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -65,6 +65,7 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision, cfg.adaptive_seqlen); lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id); + lite_cxx_config.enable_xpu_multi_stream(cfg.enable_multi_stream); #endif #ifdef LITE_SUBGRAPH_WITH_NPU diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index adeaca7c1c3b7..bc38b5efaeb87 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -50,6 +50,7 @@ struct EngineConfig { std::string autotune_file = ""; std::string precision = "int16"; bool adaptive_seqlen = false; + bool enable_multi_stream = false; // for x86 or arm int cpu_math_library_num_threads{1}; diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index d067f8e47fc48..5d2a579907883 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -656,7 +656,8 @@ void BindAnalysisConfig(py::module *m) { py::arg("autotune") = true, py::arg("autotune_file") = "", py::arg("precision") = "int16", - py::arg("adaptive_seqlen") = false) + py::arg("adaptive_seqlen") = false, + py::arg("enable_multi_stream") = false) .def("set_xpu_device_id", &AnalysisConfig::SetXpuDeviceId, py::arg("device_id") = 0) From 9ad0e37eb572330f909da81b3cc3b9f167f41b21 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Tue, 1 Nov 2022 14:53:37 +0800 Subject: [PATCH 57/91] fix memory copy in prepare_data of FusedMultiTransformer pass (#47306) * fix memory copy in prepare_data. 
test=develop --- .../fused_multi_transformer_decoder_pass.cc | 278 ++++------------- .../ir/fused_multi_transformer_decoder_pass.h | 18 -- ...d_multi_transformer_decoder_pass_tester.cc | 66 ++-- .../fused_multi_transformer_encoder_pass.cc | 287 ++++-------------- .../ir/fused_multi_transformer_encoder_pass.h | 18 -- ...d_multi_transformer_encoder_pass_tester.cc | 66 ++-- paddle/fluid/framework/ir/pass.cc | 1 + 7 files changed, 154 insertions(+), 580 deletions(-) diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc index 5559499e0b4b2..ef896e9c7e8ff 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc @@ -237,15 +237,7 @@ PDNode* FusedMultiTransformerDecoderPattern::operator()() { auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) ->assert_is_op_output("softmax") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* dropout_qk = - pattern->NewNode(dropout_qk_repr())->assert_is_op("dropout"); - auto* dropout_qk_out_var = - pattern->NewNode(dropout_qk_out_repr()) - ->assert_is_op_output("dropout", "Out") - ->AsIntermediate() - ->assert_is_op_input("matmul_v2", "X"); // -> matmul_qkv + ->assert_is_op_input("matmul_v2", "X"); // QK path Linsk matmul_qk->LinksFrom({transpose2_0_out_var, concat_0_out_var}) @@ -253,7 +245,6 @@ PDNode* FusedMultiTransformerDecoderPattern::operator()() { eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) .LinksTo({eltadd_qk_out_var}); softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); - dropout_qk->LinksFrom({softmax_qk_out_var}).LinksTo({dropout_qk_out_var}); // QKV path Nodes auto* matmul_qkv = @@ -294,14 +285,7 @@ PDNode* FusedMultiTransformerDecoderPattern::operator()() { auto* eltadd_linear_out_var = pattern->NewNode(eltadd_linear_out_repr()) ->assert_is_op_output("elementwise_add") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* dropout_linear = - pattern->NewNode(dropout_linear_repr())->assert_is_op("dropout"); - auto* dropout_linear_out_var = pattern->NewNode(dropout_linear_out_repr()) - ->assert_is_op_output("dropout") - ->AsIntermediate() - ->assert_is_op_input("elementwise_add"); + ->assert_is_op_input("elementwise_add"); auto* eltadd_out = pattern->NewNode(eltadd_out_repr())->assert_is_op("elementwise_add"); @@ -310,7 +294,7 @@ PDNode* FusedMultiTransformerDecoderPattern::operator()() { ->AsIntermediate(); // QKV path Links - matmul_qkv->LinksFrom({dropout_qk_out_var, concat_1_out_var}) + matmul_qkv->LinksFrom({softmax_qk_out_var, concat_1_out_var}) .LinksTo({matmul_qkv_out_var}); transpose2_qkv->LinksFrom({matmul_qkv_out_var}) .LinksTo({transpose2_qkv_out_var}); @@ -320,9 +304,7 @@ PDNode* FusedMultiTransformerDecoderPattern::operator()() { .LinksTo({matmul_linear_out_var}); eltadd_linear->LinksFrom({matmul_linear_out_var, eltadd_linear_b_var}) .LinksTo({eltadd_linear_out_var}); - dropout_linear->LinksFrom({eltadd_linear_out_var}) - .LinksTo({dropout_linear_out_var}); - eltadd_out->LinksFrom({input0, dropout_linear_out_var}) + eltadd_out->LinksFrom({input0, eltadd_linear_out_var}) .LinksTo({attention_output}); // Feed Forward LayerNorm Nodes @@ -358,7 +340,7 @@ PDNode* FusedMultiTransformerDecoderPattern::operator()() { ffn_layer_norm_mean_var, ffn_layer_norm_variance_var}); - // Feed Forward fc1 -> gelu -> fc2 -> dropout + // Feed Forward fc1 -> gelu -> fc2 auto* ffn_matmul0 = 
pattern->NewNode(ffn_matmul0_repr())->assert_is_op("matmul_v2"); auto* ffn_matmul0_w_var = pattern->NewNode(ffn_matmul0_w_repr()) @@ -403,13 +385,6 @@ PDNode* FusedMultiTransformerDecoderPattern::operator()() { auto* ffn_eltadd1_out_var = pattern->NewNode(ffn_eltadd1_out_repr()) ->assert_is_op_output("elementwise_add") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* ffn_dropout = - pattern->NewNode(ffn_dropout_repr())->assert_is_op("dropout"); - auto* ffn_dropout_out_var = pattern->NewNode(ffn_dropout_out_repr()) - ->assert_is_op_output("dropout") - ->AsIntermediate() ->assert_is_op_input("elementwise_add"); auto* ffn_eltadd_out = @@ -427,9 +402,8 @@ PDNode* FusedMultiTransformerDecoderPattern::operator()() { .LinksTo({ffn_matmul1_out_var}); ffn_eltadd1->LinksFrom({ffn_matmul1_out_var, ffn_eltadd1_b_var}) .LinksTo({ffn_eltadd1_out_var}); - ffn_dropout->LinksFrom({ffn_eltadd1_out_var}).LinksTo({ffn_dropout_out_var}); - ffn_eltadd_out->LinksFrom({attention_output, ffn_dropout_out_var}) + ffn_eltadd_out->LinksFrom({attention_output, ffn_eltadd1_out_var}) .LinksTo({ffn_output}); return ffn_output; @@ -575,15 +549,7 @@ PDNode* FusedMultiTransformerDecoderFuseQKVPattern::operator()() { auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) ->assert_is_op_output("softmax") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* dropout_qk = - pattern->NewNode(dropout_qk_repr())->assert_is_op("dropout"); - auto* dropout_qk_out_var = - pattern->NewNode(dropout_qk_out_repr()) - ->assert_is_op_output("dropout", "Out") - ->AsIntermediate() - ->assert_is_op_input("matmul_v2", "X"); // -> matmul_qkv + ->assert_is_op_input("matmul_v2", "X"); // QK path Linsk matmul_qk->LinksFrom({split0_q_out_var, concat_k_out_var}) @@ -591,7 +557,6 @@ PDNode* FusedMultiTransformerDecoderFuseQKVPattern::operator()() { eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) .LinksTo({eltadd_qk_out_var}); softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); - dropout_qk->LinksFrom({softmax_qk_out_var}).LinksTo({dropout_qk_out_var}); // QKV path Nodes auto* matmul_qkv = @@ -632,14 +597,7 @@ PDNode* FusedMultiTransformerDecoderFuseQKVPattern::operator()() { auto* eltadd_linear_out_var = pattern->NewNode(eltadd_linear_out_repr()) ->assert_is_op_output("elementwise_add") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* dropout_linear = - pattern->NewNode(dropout_linear_repr())->assert_is_op("dropout"); - auto* dropout_linear_out_var = pattern->NewNode(dropout_linear_out_repr()) - ->assert_is_op_output("dropout") - ->AsIntermediate() - ->assert_is_op_input("elementwise_add"); + ->assert_is_op_input("elementwise_add"); auto* eltadd_out = pattern->NewNode(eltadd_out_repr())->assert_is_op("elementwise_add"); @@ -648,7 +606,7 @@ PDNode* FusedMultiTransformerDecoderFuseQKVPattern::operator()() { ->AsIntermediate(); // QKV path Links - matmul_qkv->LinksFrom({dropout_qk_out_var, concat_v_out_var}) + matmul_qkv->LinksFrom({softmax_qk_out_var, concat_v_out_var}) .LinksTo({matmul_qkv_out_var}); transpose2_qkv->LinksFrom({matmul_qkv_out_var}) .LinksTo({transpose2_qkv_out_var}); @@ -658,9 +616,7 @@ PDNode* FusedMultiTransformerDecoderFuseQKVPattern::operator()() { .LinksTo({matmul_linear_out_var}); eltadd_linear->LinksFrom({matmul_linear_out_var, eltadd_linear_b_var}) .LinksTo({eltadd_linear_out_var}); - dropout_linear->LinksFrom({eltadd_linear_out_var}) - .LinksTo({dropout_linear_out_var}); - eltadd_out->LinksFrom({input0, dropout_linear_out_var}) + 
eltadd_out->LinksFrom({input0, eltadd_linear_out_var}) .LinksTo({attention_output}); // Feed Forward LayerNorm Nodes @@ -696,7 +652,7 @@ PDNode* FusedMultiTransformerDecoderFuseQKVPattern::operator()() { ffn_layer_norm_mean_var, ffn_layer_norm_variance_var}); - // Feed Forward fc1 -> gelu -> fc2 -> dropout + // Feed Forward fc1 -> gelu -> fc2 auto* ffn_matmul0 = pattern->NewNode(ffn_matmul0_repr())->assert_is_op("matmul_v2"); auto* ffn_matmul0_w_var = pattern->NewNode(ffn_matmul0_w_repr()) @@ -741,13 +697,6 @@ PDNode* FusedMultiTransformerDecoderFuseQKVPattern::operator()() { auto* ffn_eltadd1_out_var = pattern->NewNode(ffn_eltadd1_out_repr()) ->assert_is_op_output("elementwise_add") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* ffn_dropout = - pattern->NewNode(ffn_dropout_repr())->assert_is_op("dropout"); - auto* ffn_dropout_out_var = pattern->NewNode(ffn_dropout_out_repr()) - ->assert_is_op_output("dropout") - ->AsIntermediate() ->assert_is_op_input("elementwise_add"); auto* ffn_eltadd_out = @@ -765,9 +714,8 @@ PDNode* FusedMultiTransformerDecoderFuseQKVPattern::operator()() { .LinksTo({ffn_matmul1_out_var}); ffn_eltadd1->LinksFrom({ffn_matmul1_out_var, ffn_eltadd1_b_var}) .LinksTo({ffn_eltadd1_out_var}); - ffn_dropout->LinksFrom({ffn_eltadd1_out_var}).LinksTo({ffn_dropout_out_var}); - ffn_eltadd_out->LinksFrom({attention_output, ffn_dropout_out_var}) + ffn_eltadd_out->LinksFrom({attention_output, ffn_eltadd1_out_var}) .LinksTo({ffn_output}); return ffn_output; @@ -922,15 +870,7 @@ PDNode* MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern::operator()() { auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) ->assert_is_op_output("softmax") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* dropout_qk = - pattern->NewNode(dropout_qk_repr())->assert_is_op("dropout"); - auto* dropout_qk_out_var = - pattern->NewNode(dropout_qk_out_repr()) - ->assert_is_op_output("dropout", "Out") - ->AsIntermediate() - ->assert_is_op_input("matmul_v2", "X"); // -> matmul_qkv + ->assert_is_op_input("matmul_v2", "X"); // QK path Linsk matmul_qk->LinksFrom({split0_q_out_var, concat_k_out_var}) @@ -938,7 +878,6 @@ PDNode* MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern::operator()() { eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) .LinksTo({eltadd_qk_out_var}); softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); - dropout_qk->LinksFrom({softmax_qk_out_var}).LinksTo({dropout_qk_out_var}); // QKV path Nodes auto* matmul_qkv = @@ -987,14 +926,7 @@ PDNode* MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern::operator()() { auto* eltadd_linear_out_var = pattern->NewNode(eltadd_linear_out_repr()) ->assert_is_op_output("elementwise_add") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* dropout_linear = - pattern->NewNode(dropout_linear_repr())->assert_is_op("dropout"); - auto* dropout_linear_out_var = pattern->NewNode(dropout_linear_out_repr()) - ->assert_is_op_output("dropout") - ->AsIntermediate() - ->assert_is_op_input("elementwise_add"); + ->assert_is_op_input("elementwise_add"); auto* eltadd_out = pattern->NewNode(eltadd_out_repr())->assert_is_op("elementwise_add"); @@ -1003,7 +935,7 @@ PDNode* MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern::operator()() { ->AsIntermediate(); // QKV path Links - matmul_qkv->LinksFrom({dropout_qk_out_var, concat_v_out_var}) + matmul_qkv->LinksFrom({softmax_qk_out_var, concat_v_out_var}) .LinksTo({matmul_qkv_out_var}); transpose2_qkv->LinksFrom({matmul_qkv_out_var}) 
.LinksTo({transpose2_qkv_out_var}); @@ -1015,9 +947,7 @@ PDNode* MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern::operator()() { .LinksTo({c_allreduce_sum_out_var}); eltadd_linear->LinksFrom({c_allreduce_sum_out_var, eltadd_linear_b_var}) .LinksTo({eltadd_linear_out_var}); - dropout_linear->LinksFrom({eltadd_linear_out_var}) - .LinksTo({dropout_linear_out_var}); - eltadd_out->LinksFrom({input0, dropout_linear_out_var}) + eltadd_out->LinksFrom({input0, eltadd_linear_out_var}) .LinksTo({attention_output}); // Feed Forward LayerNorm Nodes @@ -1063,7 +993,7 @@ PDNode* MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern::operator()() { ffn_c_identity->LinksFrom({ffn_layer_norm_out_var}) .LinksTo({ffn_c_identity_out_var}); - // Feed Forward fc1 -> gelu -> fc2 -> dropout + // Feed Forward fc1 -> gelu -> fc2 auto* ffn_matmul0 = pattern->NewNode(ffn_matmul0_repr())->assert_is_op("matmul_v2"); auto* ffn_matmul0_w_var = pattern->NewNode(ffn_matmul0_w_repr()) @@ -1117,13 +1047,6 @@ PDNode* MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern::operator()() { auto* ffn_eltadd1_out_var = pattern->NewNode(ffn_eltadd1_out_repr()) ->assert_is_op_output("elementwise_add") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* ffn_dropout = - pattern->NewNode(ffn_dropout_repr())->assert_is_op("dropout"); - auto* ffn_dropout_out_var = pattern->NewNode(ffn_dropout_out_repr()) - ->assert_is_op_output("dropout") - ->AsIntermediate() ->assert_is_op_input("elementwise_add"); auto* ffn_eltadd_out = @@ -1143,9 +1066,8 @@ PDNode* MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern::operator()() { .LinksTo({ffn_c_allreduce_sum_out_var}); ffn_eltadd1->LinksFrom({ffn_c_allreduce_sum_out_var, ffn_eltadd1_b_var}) .LinksTo({ffn_eltadd1_out_var}); - ffn_dropout->LinksFrom({ffn_eltadd1_out_var}).LinksTo({ffn_dropout_out_var}); - ffn_eltadd_out->LinksFrom({attention_output, ffn_dropout_out_var}) + ffn_eltadd_out->LinksFrom({attention_output, ffn_eltadd1_out_var}) .LinksTo({ffn_output}); return ffn_output; @@ -1180,11 +1102,9 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, Node* transpose2_1_out, Node* transpose2_2_out, Node* eltadd_qk_b, - Node* dropout_qk, Node* reshape2_0, Node* matmul_linear_w, Node* eltadd_linear_b, - Node* dropout_linear, Node* ffn_layer_norm, Node* ffn_layer_norm_scale, Node* ffn_layer_norm_bias, @@ -1194,7 +1114,6 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, Node* ffn_matmul1_w, Node* ffn_eltadd0_b, Node* ffn_eltadd1_b, - Node* ffn_dropout, Node* ffn_output) { // Calc index of transformer layer by LayerNorm Scale name // This calculation assumes: @@ -1287,14 +1206,8 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, "epsilon", layer_norm->Op()->GetAttr("epsilon")); // output dropout attribute - auto* dropout_op = dropout_linear->Op(); - fused_multi_transformer_op_desc.SetAttr( - "dropout_rate", dropout_op->GetAttr("dropout_prob")); - fused_multi_transformer_op_desc.SetAttr("is_test", - dropout_op->GetAttr("is_test")); - fused_multi_transformer_op_desc.SetAttr( - "dropout_implementation", - dropout_op->GetAttr("dropout_implementation")); + fused_multi_transformer_op_desc.SetAttr("is_test", true); + fused_multi_transformer_op_desc.SetAttr("dropout_rate", 0.0f); auto* fused_multi_transformer = graph->CreateOpNode(&fused_multi_transformer_op_desc); @@ -1313,6 +1226,15 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, IR_NODE_LINK_TO(slice_op, slice_out); IR_NODE_LINK_TO(slice_out, fused_multi_transformer) + 
IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_bias, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul1_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd1_b, fused_multi_transformer); + IR_NODE_LINK_TO(fused_multi_transformer, ffn_output); }; @@ -1451,11 +1373,6 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, GET_IR_NODE_FROM_SUBGRAPH( ffn_eltadd1_out, ffn_eltadd1_out, fused_multi_transformer_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - ffn_dropout, ffn_dropout, fused_multi_transformer_pattern) - GET_IR_NODE_FROM_SUBGRAPH( - ffn_dropout_out, ffn_dropout_out, fused_multi_transformer_pattern) - GET_IR_NODE_FROM_SUBGRAPH( ffn_eltadd_out, ffn_eltadd_out, fused_multi_transformer_pattern) GET_IR_NODE_FROM_SUBGRAPH( @@ -1499,10 +1416,6 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, softmax_qk, softmax_qk, fused_multi_transformer_pattern); GET_IR_NODE_FROM_SUBGRAPH( softmax_qk_out, softmax_qk_out, fused_multi_transformer_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - dropout_qk, dropout_qk, fused_multi_transformer_pattern) - GET_IR_NODE_FROM_SUBGRAPH( - dropout_qk_out, dropout_qk_out, fused_multi_transformer_pattern) GET_IR_NODE_FROM_SUBGRAPH( matmul_qkv, matmul_qkv, fused_multi_transformer_pattern); @@ -1531,10 +1444,6 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, eltadd_linear_b, eltadd_linear_b, fused_multi_transformer_pattern) GET_IR_NODE_FROM_SUBGRAPH( eltadd_linear_out, eltadd_linear_out, fused_multi_transformer_pattern) - GET_IR_NODE_FROM_SUBGRAPH( - dropout_linear, dropout_linear, fused_multi_transformer_pattern) - GET_IR_NODE_FROM_SUBGRAPH( - dropout_linear_out, dropout_linear_out, fused_multi_transformer_pattern) GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_pattern) @@ -1554,11 +1463,9 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, transpose2_1_out, transpose2_2_out, eltadd_qk_b, - dropout_qk, reshape2_0, matmul_linear_w, eltadd_linear_b, - dropout_linear, ffn_layer_norm, ffn_layer_norm_scale, ffn_layer_norm_bias, @@ -1568,12 +1475,9 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, ffn_matmul1_w, ffn_eltadd0_b, ffn_eltadd1_b, - ffn_dropout, ffn_output); std::unordered_set marked_nodes({layer_norm, - layer_norm_scale, - layer_norm_bias, layer_norm_mean, layer_norm_variance, layer_norm_out, @@ -1613,8 +1517,6 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, eltadd_qk_out, softmax_qk, softmax_qk_out, - dropout_qk, - dropout_qk_out, transpose2_qkv, transpose2_qkv_out, matmul_qkv, @@ -1623,17 +1525,11 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, transpose2_qkv, transpose2_qkv_out, matmul_linear, - matmul_linear_w, matmul_linear_out, eltadd_linear, - eltadd_linear_b, eltadd_linear_out, - dropout_linear, - dropout_linear_out, eltadd_out, ffn_layer_norm, - ffn_layer_norm_scale, - ffn_layer_norm_bias, ffn_layer_norm_mean, ffn_layer_norm_variance, ffn_layer_norm_out, @@ -1647,8 +1543,6 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, ffn_eltadd1_out, ffn_gelu, ffn_gelu_out, - ffn_dropout, - ffn_dropout_out, ffn_eltadd_out}); // Remove unneeded nodes. 
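Note how `BuildFusion` above now hard-codes `is_test = true` and `dropout_rate = 0.0f` instead of copying the attributes of the removed `dropout` ops. That is sound for an inference-only pass because eval-mode dropout is the identity mapping under Paddle's default `upscale_in_train` mode, which this standalone check (not part of the patch) illustrates:

```python
import paddle

x = paddle.rand([2, 8])
drop = paddle.nn.Dropout(p=0.1)  # default mode='upscale_in_train'
drop.eval()                      # inference: dropout passes inputs through unchanged
out = drop(x)
assert bool(paddle.all(out == x))
print("eval-mode dropout is the identity")
```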
@@ -1850,11 +1744,9 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( Node* matmul0_w, Node* eltadd0_b, Node* eltadd_qk_b, - Node* dropout_qk, Node* reshape2_0, Node* matmul_linear_w, Node* eltadd_linear_b, - Node* dropout_linear, Node* ffn_layer_norm, Node* ffn_layer_norm_scale, Node* ffn_layer_norm_bias, @@ -1864,7 +1756,6 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( Node* ffn_matmul1_w, Node* ffn_eltadd0_b, Node* ffn_eltadd1_b, - Node* ffn_dropout, Node* ffn_output) { // Calc index of transformer layer by LayerNorm Scale name // This calculation assumes: @@ -1957,17 +1848,8 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( "epsilon", layer_norm->Op()->GetAttr("epsilon")); // output dropout attribute - auto* dropout_op = dropout_linear->Op(); - fused_multi_transformer_op_desc.SetAttr( - "dropout_rate", dropout_op->GetAttr("dropout_prob")); - fused_multi_transformer_op_desc.SetAttr("is_test", - dropout_op->GetAttr("is_test")); - fused_multi_transformer_op_desc.SetAttr( - "dropout_implementation", - dropout_op->GetAttr("dropout_implementation")); - - // fused_multi_transformer_op_desc.SetAttr("act_method", {"gelu"}); - // fused_multi_transformer_op_desc.SetAttr("trans_qkvw", {true}); + fused_multi_transformer_op_desc.SetAttr("dropout_rate", 0.0f); + fused_multi_transformer_op_desc.SetAttr("is_test", true); auto* fused_multi_transformer = graph->CreateOpNode(&fused_multi_transformer_op_desc); @@ -1986,6 +1868,15 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( IR_NODE_LINK_TO(slice_op, slice_out); IR_NODE_LINK_TO(slice_out, fused_multi_transformer) + IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_bias, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul1_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd1_b, fused_multi_transformer); + IR_NODE_LINK_TO(fused_multi_transformer, ffn_output); }; @@ -2116,12 +2007,6 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( ffn_eltadd1_out, fused_multi_transformer_fuse_qkv_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - ffn_dropout, ffn_dropout, fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(ffn_dropout_out, - ffn_dropout_out, - fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd_out, ffn_eltadd_out, fused_multi_transformer_fuse_qkv_pattern) @@ -2153,11 +2038,6 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out, fused_multi_transformer_fuse_qkv_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - dropout_qk, dropout_qk, fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(dropout_qk_out, - dropout_qk_out, - fused_multi_transformer_fuse_qkv_pattern) GET_IR_NODE_FROM_SUBGRAPH( matmul_qkv, matmul_qkv, fused_multi_transformer_fuse_qkv_pattern); @@ -2193,12 +2073,6 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH(eltadd_linear_out, eltadd_linear_out, fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(dropout_linear, - dropout_linear, - fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(dropout_linear_out, - dropout_linear_out, - fused_multi_transformer_fuse_qkv_pattern) GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, 
eltadd_out, fused_multi_transformer_fuse_qkv_pattern) @@ -2212,11 +2086,9 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( matmul0_w, eltadd0_b, eltadd_qk_b, - dropout_qk, reshape2_0, matmul_linear_w, eltadd_linear_b, - dropout_linear, ffn_layer_norm, ffn_layer_norm_scale, ffn_layer_norm_bias, @@ -2226,12 +2098,9 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( ffn_matmul1_w, ffn_eltadd0_b, ffn_eltadd1_b, - ffn_dropout, ffn_output); std::unordered_set marked_nodes({layer_norm, - layer_norm_scale, - layer_norm_bias, layer_norm_mean, layer_norm_variance, layer_norm_out, @@ -2261,8 +2130,6 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( eltadd_qk_out, softmax_qk, softmax_qk_out, - dropout_qk, - dropout_qk_out, transpose2_qkv, transpose2_qkv_out, matmul_qkv, @@ -2271,17 +2138,11 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( transpose2_qkv, transpose2_qkv_out, matmul_linear, - matmul_linear_w, matmul_linear_out, eltadd_linear, - eltadd_linear_b, eltadd_linear_out, - dropout_linear, - dropout_linear_out, eltadd_out, ffn_layer_norm, - ffn_layer_norm_scale, - ffn_layer_norm_bias, ffn_layer_norm_mean, ffn_layer_norm_variance, ffn_layer_norm_out, @@ -2295,8 +2156,6 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( ffn_eltadd1_out, ffn_gelu, ffn_gelu_out, - ffn_dropout, - ffn_dropout_out, ffn_eltadd_out}); // Remove unneeded nodes. @@ -2500,11 +2359,9 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( Node* matmul0_w, Node* eltadd0_b, Node* eltadd_qk_b, - Node* dropout_qk, Node* reshape2_0, Node* matmul_linear_w, Node* eltadd_linear_b, - Node* dropout_linear, Node* ffn_layer_norm, Node* ffn_layer_norm_scale, Node* ffn_layer_norm_bias, @@ -2514,7 +2371,6 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( Node* ffn_matmul1_w, Node* ffn_eltadd0_b, Node* ffn_eltadd1_b, - Node* ffn_dropout, Node* ffn_output) { // Calc index of transformer layer by LayerNorm Scale name // This calculation assumes: @@ -2607,23 +2463,14 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( "epsilon", layer_norm->Op()->GetAttr("epsilon")); // output dropout attribute - auto* dropout_op = dropout_linear->Op(); - fused_multi_transformer_op_desc.SetAttr( - "dropout_rate", dropout_op->GetAttr("dropout_prob")); - fused_multi_transformer_op_desc.SetAttr("is_test", - dropout_op->GetAttr("is_test")); - fused_multi_transformer_op_desc.SetAttr( - "dropout_implementation", - dropout_op->GetAttr("dropout_implementation")); + fused_multi_transformer_op_desc.SetAttr("dropout_rate", 0.0f); + fused_multi_transformer_op_desc.SetAttr("is_test", true); // parallel ring id auto* c_identity_op = c_identity->Op(); fused_multi_transformer_op_desc.SetAttr("ring_id", c_identity_op->GetAttr("ring_id")); - // fused_multi_transformer_op_desc.SetAttr("act_method", {"gelu"}); - // fused_multi_transformer_op_desc.SetAttr("trans_qkvw", {true}); - auto* fused_multi_transformer = graph->CreateOpNode(&fused_multi_transformer_op_desc); IR_NODE_LINK_TO(input0, fused_multi_transformer); @@ -2641,6 +2488,15 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( IR_NODE_LINK_TO(slice_op, slice_out); IR_NODE_LINK_TO(slice_out, fused_multi_transformer) + IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_bias, fused_multi_transformer); + 
IR_NODE_LINK_TO(ffn_matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul1_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd1_b, fused_multi_transformer); + IR_NODE_LINK_TO(fused_multi_transformer, ffn_output); }; @@ -2790,12 +2646,6 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( ffn_eltadd1_out, fused_multi_transformer_fuse_qkv_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - ffn_dropout, ffn_dropout, fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(ffn_dropout_out, - ffn_dropout_out, - fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd_out, ffn_eltadd_out, fused_multi_transformer_fuse_qkv_pattern) @@ -2827,11 +2677,6 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out, fused_multi_transformer_fuse_qkv_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - dropout_qk, dropout_qk, fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(dropout_qk_out, - dropout_qk_out, - fused_multi_transformer_fuse_qkv_pattern) GET_IR_NODE_FROM_SUBGRAPH( matmul_qkv, matmul_qkv, fused_multi_transformer_fuse_qkv_pattern); @@ -2873,12 +2718,6 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH(eltadd_linear_out, eltadd_linear_out, fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(dropout_linear, - dropout_linear, - fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(dropout_linear_out, - dropout_linear_out, - fused_multi_transformer_fuse_qkv_pattern) GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_fuse_qkv_pattern) @@ -2893,11 +2732,9 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( matmul0_w, eltadd0_b, eltadd_qk_b, - dropout_qk, reshape2_0, matmul_linear_w, eltadd_linear_b, - dropout_linear, ffn_layer_norm, ffn_layer_norm_scale, ffn_layer_norm_bias, @@ -2907,12 +2744,9 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( ffn_matmul1_w, ffn_eltadd0_b, ffn_eltadd1_b, - ffn_dropout, ffn_output); std::unordered_set marked_nodes({layer_norm, - layer_norm_scale, - layer_norm_bias, layer_norm_mean, layer_norm_variance, layer_norm_out, @@ -2944,8 +2778,6 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( eltadd_qk_out, softmax_qk, softmax_qk_out, - dropout_qk, - dropout_qk_out, transpose2_qkv, transpose2_qkv_out, matmul_qkv, @@ -2954,19 +2786,13 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( transpose2_qkv, transpose2_qkv_out, matmul_linear, - matmul_linear_w, matmul_linear_out, c_allreduce_sum, c_allreduce_sum_out, eltadd_linear, - eltadd_linear_b, eltadd_linear_out, - dropout_linear, - dropout_linear_out, eltadd_out, ffn_layer_norm, - ffn_layer_norm_scale, - ffn_layer_norm_bias, ffn_layer_norm_mean, ffn_layer_norm_variance, ffn_layer_norm_out, @@ -2984,8 +2810,6 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( ffn_eltadd1_out, ffn_gelu, ffn_gelu_out, - ffn_dropout, - ffn_dropout_out, ffn_eltadd_out}); // Remove unneeded nodes. 
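These decoder handlers no longer copy dropout_prob, is_test, and dropout_implementation from a matched dropout op; they pin dropout_rate to 0.0f and is_test to true instead, since the rewritten patterns contain no dropout at all. A small sketch of why that is semantics-preserving at inference, assuming Paddle's documented dropout modes (hypothetical DropoutInfer helper):

#include <cassert>

// Paddle dropout at inference, by implementation mode:
//   upscale_in_train:    train y = mask * x / (1 - p);  infer y = x
//   downgrade_in_infer:  train y = mask * x;            infer y = (1 - p) * x
// With p == 0 both modes are the identity, so dropping the dropout ops and
// pinning dropout_rate = 0 leaves the inference output unchanged.
float DropoutInfer(float x, float p, bool upscale_in_train) {
  return upscale_in_train ? x : (1.0f - p) * x;
}

int main() {
  assert(DropoutInfer(2.5f, 0.0f, true) == 2.5f);
  assert(DropoutInfer(2.5f, 0.0f, false) == 2.5f);
  return 0;
}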
diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.h b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.h index 0f9aae4c57d45..fd2cfc8c6677e 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.h +++ b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.h @@ -88,8 +88,6 @@ struct FusedMultiTransformerDecoderPattern : public PatternBase { PATTERN_DECL_NODE(eltadd_qk_out); PATTERN_DECL_NODE(softmax_qk); PATTERN_DECL_NODE(softmax_qk_out); - PATTERN_DECL_NODE(dropout_qk); - PATTERN_DECL_NODE(dropout_qk_out); // QK, V matmul PATTERN_DECL_NODE(matmul_qkv); @@ -106,8 +104,6 @@ struct FusedMultiTransformerDecoderPattern : public PatternBase { PATTERN_DECL_NODE(eltadd_linear); PATTERN_DECL_NODE(eltadd_linear_b); PATTERN_DECL_NODE(eltadd_linear_out); - PATTERN_DECL_NODE(dropout_linear); - PATTERN_DECL_NODE(dropout_linear_out); // output elementwise_add PATTERN_DECL_NODE(eltadd_out) @@ -137,8 +133,6 @@ struct FusedMultiTransformerDecoderPattern : public PatternBase { PATTERN_DECL_NODE(ffn_eltadd1); // ELEMENTWISE_ADD PATTERN_DECL_NODE(ffn_eltadd1_b); // ELEMENTWISE_ADD PATTERN_DECL_NODE(ffn_eltadd1_out); - PATTERN_DECL_NODE(ffn_dropout); - PATTERN_DECL_NODE(ffn_dropout_out); // output elementwise_add PATTERN_DECL_NODE(ffn_eltadd_out) @@ -193,8 +187,6 @@ struct FusedMultiTransformerDecoderFuseQKVPattern : public PatternBase { PATTERN_DECL_NODE(eltadd_qk_out); PATTERN_DECL_NODE(softmax_qk); PATTERN_DECL_NODE(softmax_qk_out); - PATTERN_DECL_NODE(dropout_qk); - PATTERN_DECL_NODE(dropout_qk_out); // QK, V matmul PATTERN_DECL_NODE(matmul_qkv); @@ -211,8 +203,6 @@ struct FusedMultiTransformerDecoderFuseQKVPattern : public PatternBase { PATTERN_DECL_NODE(eltadd_linear); PATTERN_DECL_NODE(eltadd_linear_b); PATTERN_DECL_NODE(eltadd_linear_out); - PATTERN_DECL_NODE(dropout_linear); - PATTERN_DECL_NODE(dropout_linear_out); // output elementwise_add PATTERN_DECL_NODE(eltadd_out) @@ -239,8 +229,6 @@ struct FusedMultiTransformerDecoderFuseQKVPattern : public PatternBase { PATTERN_DECL_NODE(ffn_eltadd1); // ELEMENTWISE_ADD PATTERN_DECL_NODE(ffn_eltadd1_b); // ELEMENTWISE_ADD PATTERN_DECL_NODE(ffn_eltadd1_out); - PATTERN_DECL_NODE(ffn_dropout); - PATTERN_DECL_NODE(ffn_dropout_out); // output elementwise_add PATTERN_DECL_NODE(ffn_eltadd_out) @@ -299,8 +287,6 @@ struct MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern PATTERN_DECL_NODE(eltadd_qk_out); PATTERN_DECL_NODE(softmax_qk); PATTERN_DECL_NODE(softmax_qk_out); - PATTERN_DECL_NODE(dropout_qk); - PATTERN_DECL_NODE(dropout_qk_out); // QK, V matmul PATTERN_DECL_NODE(matmul_qkv); @@ -319,8 +305,6 @@ struct MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern PATTERN_DECL_NODE(eltadd_linear); PATTERN_DECL_NODE(eltadd_linear_b); PATTERN_DECL_NODE(eltadd_linear_out); - PATTERN_DECL_NODE(dropout_linear); - PATTERN_DECL_NODE(dropout_linear_out); // output elementwise_add PATTERN_DECL_NODE(eltadd_out) @@ -351,8 +335,6 @@ struct MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern PATTERN_DECL_NODE(ffn_eltadd1); // ELEMENTWISE_ADD PATTERN_DECL_NODE(ffn_eltadd1_b); // ELEMENTWISE_ADD PATTERN_DECL_NODE(ffn_eltadd1_out); - PATTERN_DECL_NODE(ffn_dropout); - PATTERN_DECL_NODE(ffn_dropout_out); // output elementwise_add PATTERN_DECL_NODE(ffn_eltadd_out) diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass_tester.cc b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass_tester.cc index edf00ae17c7ad..dbb6781442492 100644 --- 
a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass_tester.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass_tester.cc @@ -85,13 +85,11 @@ TEST(FusedMultiTransformerDecoderPass, basic) { // (transpose_0, transpose_1) matmul -> matmul_qk // (matmul_qk, bias_qk) elementwise_add -> eltadd_qk // (eltadd_qk) softmax -> softmax_qk - // (softmax_qk) dropout -> dropout_qk - // (dropout_qk, transpose_2) matmul_v2 -> matmul_qkv + // (softmax_qk, transpose_2) matmul_v2 -> matmul_qkv // (matmul_qkv) transpose -> transpose_qkv // (transpose_qkv) reshape -> reshape_qkv // (reshape_qkv) matmul_v2 -> matmul_linear // (matmul_linear) elementwise_add -> eltadd_linear - // (eltadd_linear) dropout -> dropout_linear // (eltadd_out) elementwise_add -> attention_out // // (attention_out, scale, bias) layer_norm -> ffn_layer_norm_out @@ -100,8 +98,7 @@ TEST(FusedMultiTransformerDecoderPass, basic) { // (ffn_eltadd0) gelu -> ffn_gelu // (ffn_gelu) matmul_v2 -> ffn_matmul1 // (ffn_matmul1, ffn_bias1) elementwise_add -> ffn_eltadd1 - // (ffn_eltadd1) dropout -> ffn_dropout - // (attention_out, ffn_dropout) elementwise_add -> ffn_output + // (attention_out, ffn_eltadd1) elementwise_add -> ffn_output Layers layers; // MHA: pre LayerNorm @@ -154,10 +151,9 @@ TEST(FusedMultiTransformerDecoderPass, basic) { auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk); auto* softmax_qk = layers.softmax(elementwise_qk, -1); - auto* dropout_qk = layers.dropout(softmax_qk, 0.1, "upscale_in_train"); // MHA: QKV matmul - auto* matmul_qkv = layers.matmul_v2(dropout_qk, concat_v); + auto* matmul_qkv = layers.matmul_v2(softmax_qk, concat_v); auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 1024}, true); @@ -170,9 +166,7 @@ TEST(FusedMultiTransformerDecoderPass, basic) { auto* linear_eltadd_out = layers.elementwise_add(linear_matmut_out, bias_l, nullptr, 2); - auto* dropout_qkv = - layers.dropout(linear_eltadd_out, 0.1, "upscale_in_train"); - auto* attention_out = layers.elementwise_add(x, dropout_qkv); + auto* attention_out = layers.elementwise_add(x, linear_eltadd_out); // FFN: pre LayerNorm auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true); @@ -195,9 +189,7 @@ TEST(FusedMultiTransformerDecoderPass, basic) { auto* ffn_eltadd1_out = layers.elementwise_add(ffn_matmul1_out, ffn_bias1, nullptr, 2); - // FFN: dropout -> elementwise_add - auto* ffn_dropout = layers.dropout(ffn_eltadd1_out, 0.1, "upscale_in_train"); - layers.elementwise_add(attention_out, ffn_dropout); + layers.elementwise_add(attention_out, ffn_eltadd1_out); std::unique_ptr graph(new ir::Graph(layers.main_program())); graph->Set("__param_scope__", CreateParamScope()); @@ -215,12 +207,12 @@ TEST(FusedMultiTransformerDecoderPass, basic) { int num_fused_nodes_after = GetNumOpNodes(graph, "fused_multi_transformer"); PADDLE_ENFORCE_EQ(num_nodes_before, - num_nodes_after + 72, + num_nodes_after + 60, platform::errors::InvalidArgument( "After the fused_multi_transformer_decoder_pass, The " "node num in graph " "should be %d, but the result is %d", - num_nodes_before - 72, + num_nodes_before - 60, num_nodes_after)); PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1, @@ -253,13 +245,11 @@ TEST(FusedMultiTransformerDecoderFuseQKVPass, basic) { // (split_q, split_k) matmul -> matmul_qk // (matmul_qk, bias_qk) elementwise_add -> eltadd_qk // (eltadd_qk) softmax -> softmax_qk - // (softmax_qk) 
dropout -> dropout_qk - // (dropout_qk, transpose_2) matmul_v2 -> matmul_qkv + // (softmax_qk, transpose_2) matmul_v2 -> matmul_qkv // (matmul_qkv) transpose -> transpose_qkv // (transpose_qkv) reshape -> reshape_qkv // (reshape_qkv) matmul_v2 -> matmul_linear // (matmul_linear) elementwise_add -> eltadd_linear - // (eltadd_linear) dropout -> dropout_linear // (eltadd_out) elementwise_add -> attention_out // // (attention_out, scale, bias) layer_norm -> ffn_layer_norm_out @@ -268,8 +258,7 @@ TEST(FusedMultiTransformerDecoderFuseQKVPass, basic) { // (ffn_eltadd0) gelu -> ffn_gelu // (ffn_gelu) matmul_v2 -> ffn_matmul1 // (ffn_matmul1, ffn_bias1) elementwise_add -> ffn_eltadd1 - // (ffn_eltadd1) dropout -> ffn_dropout - // (attention_out, ffn_dropout) elementwise_add -> ffn_output + // (attention_out, ffn_eltadd1) elementwise_add -> ffn_output // // (transpose_1, transpose_2) while -> decoder block @@ -313,10 +302,9 @@ TEST(FusedMultiTransformerDecoderFuseQKVPass, basic) { auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk); auto* softmax_qk = layers.softmax(elementwise_qk, -1); - auto* dropout_qk = layers.dropout(softmax_qk, 0.1, "upscale_in_train"); // MHA: QKV matmul - auto* matmul_qkv = layers.matmul_v2(dropout_qk, concat_v); + auto* matmul_qkv = layers.matmul_v2(softmax_qk, concat_v); auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 1024}, true); @@ -329,9 +317,7 @@ TEST(FusedMultiTransformerDecoderFuseQKVPass, basic) { auto* linear_eltadd_out = layers.elementwise_add(linear_matmut_out, bias_l, nullptr, 2); - auto* dropout_qkv = - layers.dropout(linear_eltadd_out, 0.1, "upscale_in_train"); - auto* attention_out = layers.elementwise_add(x, dropout_qkv); + auto* attention_out = layers.elementwise_add(x, linear_eltadd_out); // FFN: pre LayerNorm auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true); @@ -354,9 +340,7 @@ TEST(FusedMultiTransformerDecoderFuseQKVPass, basic) { auto* ffn_eltadd1_out = layers.elementwise_add(ffn_matmul1_out, ffn_bias1, nullptr, 2); - // FFN: dropout -> elementwise_add - auto* ffn_dropout = layers.dropout(ffn_eltadd1_out, 0.1, "upscale_in_train"); - layers.elementwise_add(attention_out, ffn_dropout); + layers.elementwise_add(attention_out, ffn_eltadd1_out); std::unique_ptr graph(new ir::Graph(layers.main_program())); graph->Set("__param_scope__", CreateParamScope()); @@ -375,11 +359,11 @@ TEST(FusedMultiTransformerDecoderFuseQKVPass, basic) { PADDLE_ENFORCE_EQ( num_nodes_before, - num_nodes_after + 62, + num_nodes_after + 50, platform::errors::InvalidArgument( "After the fused_multi_transformer_decoder_fuse_qkv_pass, " "The node num in graph should be %d, but the result is %d", - num_nodes_before - 62, + num_nodes_before - 50, num_nodes_after)); PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1, @@ -413,14 +397,12 @@ TEST(MultiDevicesFusedMultiTransformerDecoderFuseQKVPass, basic) { // (split_q, split_k) matmul -> matmul_qk // (matmul_qk, bias_qk) elementwise_add -> eltadd_qk // (eltadd_qk) softmax -> softmax_qk - // (softmax_qk) dropout -> dropout_qk - // (dropout_qk, transpose_2) matmul_v2 -> matmul_qkv + // (softmax_qk, transpose_2) matmul_v2 -> matmul_qkv // (matmul_qkv) transpose -> transpose_qkv // (transpose_qkv) reshape -> reshape_qkv // (reshape_qkv) matmul_v2 -> matmul_linear // (matmul_linear) c_allreduce_sum -> c_all_reduce_out // (matmul_linear) elementwise_add -> eltadd_linear - // 
(eltadd_linear) dropout -> dropout_linear // (eltadd_out) elementwise_add -> attention_out // // (attention_out, scale, bias) layer_norm -> ffn_layer_norm_out @@ -431,8 +413,7 @@ TEST(MultiDevicesFusedMultiTransformerDecoderFuseQKVPass, basic) { // (ffn_gelu) matmul_v2 -> ffn_matmul1 // (ffn_matmul1) c_allreduce_sum -> c_allreduce_out // (ffn_matmul1, ffn_bias1) elementwise_add -> ffn_eltadd1 - // (ffn_eltadd1) dropout -> ffn_dropout - // (attention_out, ffn_dropout) elementwise_add -> ffn_output + // (attention_out, ffn_eltadd1) elementwise_add -> ffn_output // // (transpose_1, transpose_2) while -> decoder block @@ -477,10 +458,9 @@ TEST(MultiDevicesFusedMultiTransformerDecoderFuseQKVPass, basic) { auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk); auto* softmax_qk = layers.softmax(elementwise_qk, -1); - auto* dropout_qk = layers.dropout(softmax_qk, 0.1, "upscale_in_train"); // MHA: QKV matmul - auto* matmul_qkv = layers.matmul_v2(dropout_qk, concat_v); + auto* matmul_qkv = layers.matmul_v2(softmax_qk, concat_v); auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 1024}, true); @@ -494,9 +474,7 @@ TEST(MultiDevicesFusedMultiTransformerDecoderFuseQKVPass, basic) { auto* linear_eltadd_out = layers.elementwise_add(c_allreduce_out, bias_l, nullptr, 2); - auto* dropout_qkv = - layers.dropout(linear_eltadd_out, 0.1, "upscale_in_train"); - auto* attention_out = layers.elementwise_add(x, dropout_qkv); + auto* attention_out = layers.elementwise_add(x, linear_eltadd_out); // FFN: pre LayerNorm auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true); @@ -521,9 +499,7 @@ TEST(MultiDevicesFusedMultiTransformerDecoderFuseQKVPass, basic) { auto* ffn_eltadd1_out = layers.elementwise_add(ffn_c_allreduce_out, ffn_bias1, nullptr, 2); - // FFN: dropout -> elementwise_add - auto* ffn_dropout = layers.dropout(ffn_eltadd1_out, 0.1, "upscale_in_train"); - layers.elementwise_add(attention_out, ffn_dropout); + layers.elementwise_add(attention_out, ffn_eltadd1_out); std::unique_ptr graph(new ir::Graph(layers.main_program())); graph->Set("__param_scope__", CreateParamScope()); @@ -544,11 +520,11 @@ TEST(MultiDevicesFusedMultiTransformerDecoderFuseQKVPass, basic) { PADDLE_ENFORCE_EQ( num_nodes_before, - num_nodes_after + 70, + num_nodes_after + 58, platform::errors::InvalidArgument( "After the fused_multi_transformer_decoder_fuse_qkv_pass, " "The node num in graph should be %d, but the result is %d", - num_nodes_before - 70, + num_nodes_before - 58, num_nodes_after)); PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1, diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc index d93c29765649a..8738779f5efc9 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc @@ -227,15 +227,7 @@ PDNode* FusedMultiTransformerEncoderPattern::operator()() { auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) ->assert_is_op_output("softmax") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* dropout_qk = - pattern->NewNode(dropout_qk_repr())->assert_is_op("dropout"); - auto* dropout_qk_out_var = - pattern->NewNode(dropout_qk_out_repr()) - ->assert_is_op_output("dropout", "Out") - ->AsIntermediate() - ->assert_is_op_input("matmul_v2", "X"); // -> matmul_qkv + 
->assert_is_op_input("matmul_v2", "X"); // QK path Linsk matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var}) @@ -243,7 +235,6 @@ PDNode* FusedMultiTransformerEncoderPattern::operator()() { eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) .LinksTo({eltadd_qk_out_var}); softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); - dropout_qk->LinksFrom({softmax_qk_out_var}).LinksTo({dropout_qk_out_var}); // QKV path Nodes auto* matmul_qkv = @@ -284,14 +275,7 @@ PDNode* FusedMultiTransformerEncoderPattern::operator()() { auto* eltadd_linear_out_var = pattern->NewNode(eltadd_linear_out_repr()) ->assert_is_op_output("elementwise_add") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* dropout_linear = - pattern->NewNode(dropout_linear_repr())->assert_is_op("dropout"); - auto* dropout_linear_out_var = pattern->NewNode(dropout_linear_out_repr()) - ->assert_is_op_output("dropout") - ->AsIntermediate() - ->assert_is_op_input("elementwise_add"); + ->assert_is_op_input("elementwise_add"); auto* eltadd_out = pattern->NewNode(eltadd_out_repr())->assert_is_op("elementwise_add"); @@ -300,7 +284,7 @@ PDNode* FusedMultiTransformerEncoderPattern::operator()() { ->AsIntermediate(); // QKV path Links - matmul_qkv->LinksFrom({dropout_qk_out_var, transpose2_2_out_var}) + matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var}) .LinksTo({matmul_qkv_out_var}); transpose2_qkv->LinksFrom({matmul_qkv_out_var}) .LinksTo({transpose2_qkv_out_var}); @@ -310,9 +294,7 @@ PDNode* FusedMultiTransformerEncoderPattern::operator()() { .LinksTo({matmul_linear_out_var}); eltadd_linear->LinksFrom({matmul_linear_out_var, eltadd_linear_b_var}) .LinksTo({eltadd_linear_out_var}); - dropout_linear->LinksFrom({eltadd_linear_out_var}) - .LinksTo({dropout_linear_out_var}); - eltadd_out->LinksFrom({input0, dropout_linear_out_var}) + eltadd_out->LinksFrom({input0, eltadd_linear_out_var}) .LinksTo({attention_output}); // while loop @@ -352,7 +334,7 @@ PDNode* FusedMultiTransformerEncoderPattern::operator()() { ffn_layer_norm_mean_var, ffn_layer_norm_variance_var}); - // Feed Forward fc1 -> gelu -> fc2 -> dropout + // Feed Forward fc1 -> gelu -> fc2 auto* ffn_matmul0 = pattern->NewNode(ffn_matmul0_repr())->assert_is_op("matmul_v2"); auto* ffn_matmul0_w_var = pattern->NewNode(ffn_matmul0_w_repr()) @@ -397,13 +379,6 @@ PDNode* FusedMultiTransformerEncoderPattern::operator()() { auto* ffn_eltadd1_out_var = pattern->NewNode(ffn_eltadd1_out_repr()) ->assert_is_op_output("elementwise_add") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* ffn_dropout = - pattern->NewNode(ffn_dropout_repr())->assert_is_op("dropout"); - auto* ffn_dropout_out_var = pattern->NewNode(ffn_dropout_out_repr()) - ->assert_is_op_output("dropout") - ->AsIntermediate() ->assert_is_op_input("elementwise_add"); auto* ffn_eltadd_out = @@ -421,9 +396,8 @@ PDNode* FusedMultiTransformerEncoderPattern::operator()() { .LinksTo({ffn_matmul1_out_var}); ffn_eltadd1->LinksFrom({ffn_matmul1_out_var, ffn_eltadd1_b_var}) .LinksTo({ffn_eltadd1_out_var}); - ffn_dropout->LinksFrom({ffn_eltadd1_out_var}).LinksTo({ffn_dropout_out_var}); - ffn_eltadd_out->LinksFrom({attention_output, ffn_dropout_out_var}) + ffn_eltadd_out->LinksFrom({attention_output, ffn_eltadd1_out_var}) .LinksTo({ffn_output}); return ffn_output; @@ -545,15 +519,7 @@ PDNode* FusedMultiTransformerEncoderFuseQKVPattern::operator()() { auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) ->assert_is_op_output("softmax") ->AsIntermediate() - 
->assert_is_op_input("dropout"); - - auto* dropout_qk = - pattern->NewNode(dropout_qk_repr())->assert_is_op("dropout"); - auto* dropout_qk_out_var = - pattern->NewNode(dropout_qk_out_repr()) - ->assert_is_op_output("dropout", "Out") - ->AsIntermediate() - ->assert_is_op_input("matmul_v2", "X"); // -> matmul_qkv + ->assert_is_op_input("matmul_v2", "X"); // QK path Linsk matmul_qk->LinksFrom({split0_q_out_var, split0_k_out_var}) @@ -561,7 +527,6 @@ PDNode* FusedMultiTransformerEncoderFuseQKVPattern::operator()() { eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) .LinksTo({eltadd_qk_out_var}); softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); - dropout_qk->LinksFrom({softmax_qk_out_var}).LinksTo({dropout_qk_out_var}); // QKV path Nodes auto* matmul_qkv = @@ -602,14 +567,7 @@ PDNode* FusedMultiTransformerEncoderFuseQKVPattern::operator()() { auto* eltadd_linear_out_var = pattern->NewNode(eltadd_linear_out_repr()) ->assert_is_op_output("elementwise_add") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* dropout_linear = - pattern->NewNode(dropout_linear_repr())->assert_is_op("dropout"); - auto* dropout_linear_out_var = pattern->NewNode(dropout_linear_out_repr()) - ->assert_is_op_output("dropout") - ->AsIntermediate() - ->assert_is_op_input("elementwise_add"); + ->assert_is_op_input("elementwise_add"); auto* eltadd_out = pattern->NewNode(eltadd_out_repr())->assert_is_op("elementwise_add"); @@ -618,7 +576,7 @@ PDNode* FusedMultiTransformerEncoderFuseQKVPattern::operator()() { ->AsIntermediate(); // QKV path Links - matmul_qkv->LinksFrom({dropout_qk_out_var, split0_v_out_var}) + matmul_qkv->LinksFrom({softmax_qk_out_var, split0_v_out_var}) .LinksTo({matmul_qkv_out_var}); transpose2_qkv->LinksFrom({matmul_qkv_out_var}) .LinksTo({transpose2_qkv_out_var}); @@ -628,9 +586,7 @@ PDNode* FusedMultiTransformerEncoderFuseQKVPattern::operator()() { .LinksTo({matmul_linear_out_var}); eltadd_linear->LinksFrom({matmul_linear_out_var, eltadd_linear_b_var}) .LinksTo({eltadd_linear_out_var}); - dropout_linear->LinksFrom({eltadd_linear_out_var}) - .LinksTo({dropout_linear_out_var}); - eltadd_out->LinksFrom({input0, dropout_linear_out_var}) + eltadd_out->LinksFrom({input0, eltadd_linear_out_var}) .LinksTo({attention_output}); // Feed Forward LayerNorm Nodes @@ -666,7 +622,7 @@ PDNode* FusedMultiTransformerEncoderFuseQKVPattern::operator()() { ffn_layer_norm_mean_var, ffn_layer_norm_variance_var}); - // Feed Forward fc1 -> gelu -> fc2 -> dropout + // Feed Forward fc1 -> gelu -> fc2 auto* ffn_matmul0 = pattern->NewNode(ffn_matmul0_repr())->assert_is_op("matmul_v2"); auto* ffn_matmul0_w_var = pattern->NewNode(ffn_matmul0_w_repr()) @@ -711,13 +667,6 @@ PDNode* FusedMultiTransformerEncoderFuseQKVPattern::operator()() { auto* ffn_eltadd1_out_var = pattern->NewNode(ffn_eltadd1_out_repr()) ->assert_is_op_output("elementwise_add") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* ffn_dropout = - pattern->NewNode(ffn_dropout_repr())->assert_is_op("dropout"); - auto* ffn_dropout_out_var = pattern->NewNode(ffn_dropout_out_repr()) - ->assert_is_op_output("dropout") - ->AsIntermediate() ->assert_is_op_input("elementwise_add"); auto* ffn_eltadd_out = @@ -735,9 +684,8 @@ PDNode* FusedMultiTransformerEncoderFuseQKVPattern::operator()() { .LinksTo({ffn_matmul1_out_var}); ffn_eltadd1->LinksFrom({ffn_matmul1_out_var, ffn_eltadd1_b_var}) .LinksTo({ffn_eltadd1_out_var}); - ffn_dropout->LinksFrom({ffn_eltadd1_out_var}).LinksTo({ffn_dropout_out_var}); - 
ffn_eltadd_out->LinksFrom({attention_output, ffn_dropout_out_var}) + ffn_eltadd_out->LinksFrom({attention_output, ffn_eltadd1_out_var}) .LinksTo({ffn_output}); return ffn_output; @@ -868,15 +816,7 @@ PDNode* MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern::operator()() { auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) ->assert_is_op_output("softmax") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* dropout_qk = - pattern->NewNode(dropout_qk_repr())->assert_is_op("dropout"); - auto* dropout_qk_out_var = - pattern->NewNode(dropout_qk_out_repr()) - ->assert_is_op_output("dropout", "Out") - ->AsIntermediate() - ->assert_is_op_input("matmul_v2", "X"); // -> matmul_qkv + ->assert_is_op_input("matmul_v2", "X"); // QK path Linsk matmul_qk->LinksFrom({split0_q_out_var, split0_k_out_var}) @@ -884,7 +824,6 @@ PDNode* MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern::operator()() { eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) .LinksTo({eltadd_qk_out_var}); softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); - dropout_qk->LinksFrom({softmax_qk_out_var}).LinksTo({dropout_qk_out_var}); // QKV path Nodes auto* matmul_qkv = @@ -933,14 +872,7 @@ PDNode* MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern::operator()() { auto* eltadd_linear_out_var = pattern->NewNode(eltadd_linear_out_repr()) ->assert_is_op_output("elementwise_add") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* dropout_linear = - pattern->NewNode(dropout_linear_repr())->assert_is_op("dropout"); - auto* dropout_linear_out_var = pattern->NewNode(dropout_linear_out_repr()) - ->assert_is_op_output("dropout") - ->AsIntermediate() - ->assert_is_op_input("elementwise_add"); + ->assert_is_op_input("elementwise_add"); auto* eltadd_out = pattern->NewNode(eltadd_out_repr())->assert_is_op("elementwise_add"); @@ -949,7 +881,7 @@ PDNode* MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern::operator()() { ->AsIntermediate(); // QKV path Links - matmul_qkv->LinksFrom({dropout_qk_out_var, split0_v_out_var}) + matmul_qkv->LinksFrom({softmax_qk_out_var, split0_v_out_var}) .LinksTo({matmul_qkv_out_var}); transpose2_qkv->LinksFrom({matmul_qkv_out_var}) .LinksTo({transpose2_qkv_out_var}); @@ -961,9 +893,7 @@ PDNode* MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern::operator()() { .LinksTo({c_allreduce_sum_out_var}); eltadd_linear->LinksFrom({c_allreduce_sum_out_var, eltadd_linear_b_var}) .LinksTo({eltadd_linear_out_var}); - dropout_linear->LinksFrom({eltadd_linear_out_var}) - .LinksTo({dropout_linear_out_var}); - eltadd_out->LinksFrom({input0, dropout_linear_out_var}) + eltadd_out->LinksFrom({input0, eltadd_linear_out_var}) .LinksTo({attention_output}); // Feed Forward LayerNorm Nodes @@ -1009,7 +939,7 @@ PDNode* MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern::operator()() { ffn_c_identity->LinksFrom({ffn_layer_norm_out_var}) .LinksTo({ffn_c_identity_out_var}); - // Feed Forward fc1 -> gelu -> fc2 -> dropout + // Feed Forward fc1 -> gelu -> fc2 auto* ffn_matmul0 = pattern->NewNode(ffn_matmul0_repr())->assert_is_op("matmul_v2"); auto* ffn_matmul0_w_var = pattern->NewNode(ffn_matmul0_w_repr()) @@ -1063,13 +993,6 @@ PDNode* MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern::operator()() { auto* ffn_eltadd1_out_var = pattern->NewNode(ffn_eltadd1_out_repr()) ->assert_is_op_output("elementwise_add") ->AsIntermediate() - ->assert_is_op_input("dropout"); - - auto* ffn_dropout = - pattern->NewNode(ffn_dropout_repr())->assert_is_op("dropout"); - 
auto* ffn_dropout_out_var = pattern->NewNode(ffn_dropout_out_repr()) - ->assert_is_op_output("dropout") - ->AsIntermediate() ->assert_is_op_input("elementwise_add"); auto* ffn_eltadd_out = @@ -1089,9 +1012,8 @@ PDNode* MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern::operator()() { .LinksTo({ffn_c_allreduce_sum_out_var}); ffn_eltadd1->LinksFrom({ffn_c_allreduce_sum_out_var, ffn_eltadd1_b_var}) .LinksTo({ffn_eltadd1_out_var}); - ffn_dropout->LinksFrom({ffn_eltadd1_out_var}).LinksTo({ffn_dropout_out_var}); - ffn_eltadd_out->LinksFrom({attention_output, ffn_dropout_out_var}) + ffn_eltadd_out->LinksFrom({attention_output, ffn_eltadd1_out_var}) .LinksTo({ffn_output}); return ffn_output; @@ -1253,11 +1175,9 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, Node* transpose2_1_out, Node* transpose2_2_out, Node* eltadd_qk_b, - Node* dropout_qk, Node* reshape2_0, Node* matmul_linear_w, Node* eltadd_linear_b, - Node* dropout_linear, Node* while0, Node* ffn_layer_norm, Node* ffn_layer_norm_scale, @@ -1268,7 +1188,6 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, Node* ffn_matmul1_w, Node* ffn_eltadd0_b, Node* ffn_eltadd1_b, - Node* ffn_dropout, Node* ffn_output) { auto reshape_desc = reshape2_0->Op(); int num_head = @@ -1375,7 +1294,9 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, fill_const_op_desc.SetAttr("input_dim_idx", 0); fill_const_op_desc.SetAttr("output_dim_idx", 1); fill_const_op_desc.SetAttr("value", 0); - fill_const_op_desc.SetAttr("dtype", static_cast(proto::VarType::FP32)); + fill_const_op_desc.SetAttr( + "dtype", + static_cast(framework::TransToProtoVarType(wq_tensor->dtype()))); auto* fill_const_op = graph->CreateOpNode(&fill_const_op_desc); fused_multi_transformer_op_desc.SetInput("CacheKV", {cache_kv->Name()}); @@ -1409,15 +1330,8 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, fused_multi_transformer_op_desc.SetAttr( "epsilon", layer_norm->Op()->GetAttr("epsilon")); - // output dropout attribute - auto* dropout_op = dropout_linear->Op(); - fused_multi_transformer_op_desc.SetAttr( - "dropout_rate", dropout_op->GetAttr("dropout_prob")); - fused_multi_transformer_op_desc.SetAttr("is_test", - dropout_op->GetAttr("is_test")); - fused_multi_transformer_op_desc.SetAttr( - "dropout_implementation", - dropout_op->GetAttr("dropout_implementation")); + fused_multi_transformer_op_desc.SetAttr("is_test", true); + fused_multi_transformer_op_desc.SetAttr("dropout_rate", 0.0f); auto* fused_multi_transformer = graph->CreateOpNode(&fused_multi_transformer_op_desc); @@ -1433,6 +1347,15 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, IR_NODE_LINK_TO(fill_const_op, cache_kv); IR_NODE_LINK_TO(cache_kv, fused_multi_transformer); + IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_bias, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul1_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd1_b, fused_multi_transformer); + IR_NODE_LINK_TO(fused_multi_transformer, ffn_output); // rewrite while OP input @@ -1620,11 +1543,6 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, GET_IR_NODE_FROM_SUBGRAPH( ffn_eltadd1_out, ffn_eltadd1_out, fused_multi_transformer_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - 
ffn_dropout, ffn_dropout, fused_multi_transformer_pattern) - GET_IR_NODE_FROM_SUBGRAPH( - ffn_dropout_out, ffn_dropout_out, fused_multi_transformer_pattern) - GET_IR_NODE_FROM_SUBGRAPH( ffn_eltadd_out, ffn_eltadd_out, fused_multi_transformer_pattern) GET_IR_NODE_FROM_SUBGRAPH( @@ -1668,11 +1586,6 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, softmax_qk, softmax_qk, fused_multi_transformer_pattern); GET_IR_NODE_FROM_SUBGRAPH( softmax_qk_out, softmax_qk_out, fused_multi_transformer_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - dropout_qk, dropout_qk, fused_multi_transformer_pattern) - GET_IR_NODE_FROM_SUBGRAPH( - dropout_qk_out, dropout_qk_out, fused_multi_transformer_pattern) - GET_IR_NODE_FROM_SUBGRAPH( matmul_qkv, matmul_qkv, fused_multi_transformer_pattern); GET_IR_NODE_FROM_SUBGRAPH( @@ -1700,11 +1613,6 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, eltadd_linear_b, eltadd_linear_b, fused_multi_transformer_pattern) GET_IR_NODE_FROM_SUBGRAPH( eltadd_linear_out, eltadd_linear_out, fused_multi_transformer_pattern) - GET_IR_NODE_FROM_SUBGRAPH( - dropout_linear, dropout_linear, fused_multi_transformer_pattern) - GET_IR_NODE_FROM_SUBGRAPH( - dropout_linear_out, dropout_linear_out, fused_multi_transformer_pattern) - GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_pattern) @@ -1723,11 +1631,9 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, transpose2_1_out, transpose2_2_out, eltadd_qk_b, - dropout_qk, reshape2_0, matmul_linear_w, eltadd_linear_b, - dropout_linear, while0, ffn_layer_norm, ffn_layer_norm_scale, @@ -1738,12 +1644,9 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, ffn_matmul1_w, ffn_eltadd0_b, ffn_eltadd1_b, - ffn_dropout, ffn_output); std::unordered_set marked_nodes({layer_norm, - layer_norm_scale, - layer_norm_bias, layer_norm_mean, layer_norm_variance, layer_norm_out, @@ -1777,8 +1680,6 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, eltadd_qk_out, softmax_qk, softmax_qk_out, - dropout_qk, - dropout_qk_out, transpose2_qkv, transpose2_qkv_out, matmul_qkv, @@ -1787,17 +1688,11 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, transpose2_qkv, transpose2_qkv_out, matmul_linear, - matmul_linear_w, matmul_linear_out, eltadd_linear, - eltadd_linear_b, eltadd_linear_out, - dropout_linear, - dropout_linear_out, eltadd_out, ffn_layer_norm, - ffn_layer_norm_scale, - ffn_layer_norm_bias, ffn_layer_norm_mean, ffn_layer_norm_variance, ffn_layer_norm_out, @@ -1811,8 +1706,6 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, ffn_eltadd1_out, ffn_gelu, ffn_gelu_out, - ffn_dropout, - ffn_dropout_out, ffn_eltadd_out}); // Remove unneeded nodes. 
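One more change in the encoder BuildFusion above: the CacheKV fill_constant dtype is now derived from the weight tensor through framework::TransToProtoVarType rather than hard-coded to FP32, so an FP16 model gets an FP16 cache. A reduced sketch of the intent (the two-value enum and CacheKVDType helper are illustrative, not Paddle API):

#include <cassert>

enum class DType { kFP32, kFP16 };

// Sketch: the CacheKV fill_constant should inherit the weights' dtype; a
// hard-coded FP32 cache would mismatch FP16 weights at kernel dispatch.
DType CacheKVDType(DType weight_dtype) {
  return weight_dtype;  // the old behavior was: return DType::kFP32;
}

int main() {
  assert(CacheKVDType(DType::kFP16) == DType::kFP16);
  return 0;
}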
@@ -2016,11 +1909,9 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( Node* split0_k_out, Node* split0_v_out, Node* eltadd_qk_b, - Node* dropout_qk, Node* reshape2_0, Node* matmul_linear_w, Node* eltadd_linear_b, - Node* dropout_linear, Node* while0, Node* ffn_layer_norm, Node* ffn_layer_norm_scale, @@ -2031,7 +1922,6 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( Node* ffn_matmul1_w, Node* ffn_eltadd0_b, Node* ffn_eltadd1_b, - Node* ffn_dropout, Node* ffn_output) { auto reshape_desc = reshape2_0->Op(); int num_head = @@ -2104,7 +1994,9 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( fill_const_op_desc.SetAttr("input_dim_idx", 0); fill_const_op_desc.SetAttr("output_dim_idx", 1); fill_const_op_desc.SetAttr("value", 0); - fill_const_op_desc.SetAttr("dtype", static_cast(proto::VarType::FP32)); + fill_const_op_desc.SetAttr("dtype", + static_cast(framework::TransToProtoVarType( + qkv_w_tensor->dtype()))); auto* fill_const_op = graph->CreateOpNode(&fill_const_op_desc); fused_multi_transformer_op_desc.SetInput("CacheKV", {cache_kv->Name()}); @@ -2139,14 +2031,8 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( "epsilon", layer_norm->Op()->GetAttr("epsilon")); // output dropout attribute - auto* dropout_op = dropout_linear->Op(); - fused_multi_transformer_op_desc.SetAttr( - "dropout_rate", dropout_op->GetAttr("dropout_prob")); - fused_multi_transformer_op_desc.SetAttr("is_test", - dropout_op->GetAttr("is_test")); - fused_multi_transformer_op_desc.SetAttr( - "dropout_implementation", - dropout_op->GetAttr("dropout_implementation")); + fused_multi_transformer_op_desc.SetAttr("is_test", true); + fused_multi_transformer_op_desc.SetAttr("dropout_rate", 0.0f); auto* fused_multi_transformer = graph->CreateOpNode(&fused_multi_transformer_op_desc); @@ -2162,6 +2048,15 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( IR_NODE_LINK_TO(fill_const_op, cache_kv); IR_NODE_LINK_TO(cache_kv, fused_multi_transformer); + IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_bias, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul1_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd1_b, fused_multi_transformer); + IR_NODE_LINK_TO(fused_multi_transformer, ffn_output); // rewrite while OP input @@ -2315,12 +2210,6 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( ffn_eltadd1_out, fused_multi_transformer_fuse_qkv_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - ffn_dropout, ffn_dropout, fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(ffn_dropout_out, - ffn_dropout_out, - fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd_out, ffn_eltadd_out, fused_multi_transformer_fuse_qkv_pattern) @@ -2352,11 +2241,6 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out, fused_multi_transformer_fuse_qkv_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - dropout_qk, dropout_qk, fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(dropout_qk_out, - dropout_qk_out, - fused_multi_transformer_fuse_qkv_pattern) GET_IR_NODE_FROM_SUBGRAPH( matmul_qkv, matmul_qkv, fused_multi_transformer_fuse_qkv_pattern); @@ -2392,12 +2276,6 @@ int 
FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH(eltadd_linear_out, eltadd_linear_out, fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(dropout_linear, - dropout_linear, - fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(dropout_linear_out, - dropout_linear_out, - fused_multi_transformer_fuse_qkv_pattern) GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_fuse_qkv_pattern) @@ -2416,11 +2294,9 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( split0_k_out, split0_v_out, eltadd_qk_b, - dropout_qk, reshape2_0, matmul_linear_w, eltadd_linear_b, - dropout_linear, while0, ffn_layer_norm, ffn_layer_norm_scale, @@ -2431,12 +2307,9 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( ffn_matmul1_w, ffn_eltadd0_b, ffn_eltadd1_b, - ffn_dropout, ffn_output); std::unordered_set marked_nodes({layer_norm, - layer_norm_scale, - layer_norm_bias, layer_norm_mean, layer_norm_variance, layer_norm_out, @@ -2458,8 +2331,6 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( eltadd_qk_out, softmax_qk, softmax_qk_out, - dropout_qk, - dropout_qk_out, transpose2_qkv, transpose2_qkv_out, matmul_qkv, @@ -2468,17 +2339,11 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( transpose2_qkv, transpose2_qkv_out, matmul_linear, - matmul_linear_w, matmul_linear_out, eltadd_linear, - eltadd_linear_b, eltadd_linear_out, - dropout_linear, - dropout_linear_out, eltadd_out, ffn_layer_norm, - ffn_layer_norm_scale, - ffn_layer_norm_bias, ffn_layer_norm_mean, ffn_layer_norm_variance, ffn_layer_norm_out, @@ -2492,8 +2357,6 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( ffn_eltadd1_out, ffn_gelu, ffn_gelu_out, - ffn_dropout, - ffn_dropout_out, ffn_eltadd_out}); // Remove unneeded nodes. 
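Note also that the weight, scale, and bias variables (matmul_linear_w, ffn_layer_norm_scale, and the like) were dropped from marked_nodes across these handlers: they are now inputs of the fused op and must survive node removal. A toy statement of that invariant (hypothetical SafeToRemove helper, not Paddle's GraphSafeRemoveNodes):

#include <set>
#include <string>

// Nodes marked for deletion must exclude everything the fused op still
// consumes; otherwise erasing them would dangle the fused op's inputs.
bool SafeToRemove(const std::set<std::string>& marked,
                  const std::set<std::string>& fused_op_inputs) {
  for (const auto& name : fused_op_inputs) {
    if (marked.count(name)) return false;
  }
  return true;
}

int main() {
  std::set<std::string> marked = {"softmax_qk_out", "eltadd_linear_out"};
  std::set<std::string> fused_inputs = {"matmul_linear_w", "ffn_eltadd0_b"};
  return SafeToRemove(marked, fused_inputs) ? 0 : 1;
}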
@@ -2700,11 +2563,9 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( Node* split0_k_out, Node* split0_v_out, Node* eltadd_qk_b, - Node* dropout_qk, Node* reshape2_0, Node* matmul_linear_w, Node* eltadd_linear_b, - Node* dropout_linear, Node* while0, Node* ffn_layer_norm, Node* ffn_layer_norm_scale, @@ -2715,7 +2576,6 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( Node* ffn_matmul1_w, Node* ffn_eltadd0_b, Node* ffn_eltadd1_b, - Node* ffn_dropout, Node* ffn_output) { auto reshape_desc = reshape2_0->Op(); int num_head = @@ -2789,7 +2649,9 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( fill_const_op_desc.SetAttr("input_dim_idx", 0); fill_const_op_desc.SetAttr("output_dim_idx", 1); fill_const_op_desc.SetAttr("value", 0); - fill_const_op_desc.SetAttr("dtype", static_cast(proto::VarType::FP32)); + fill_const_op_desc.SetAttr("dtype", + static_cast(framework::TransToProtoVarType( + qkv_w_tensor->dtype()))); auto* fill_const_op = graph->CreateOpNode(&fill_const_op_desc); fused_multi_transformer_op_desc.SetInput("CacheKV", {cache_kv->Name()}); @@ -2824,14 +2686,8 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( "epsilon", layer_norm->Op()->GetAttr("epsilon")); // output dropout attribute - auto* dropout_op = dropout_linear->Op(); - fused_multi_transformer_op_desc.SetAttr( - "dropout_rate", dropout_op->GetAttr("dropout_prob")); - fused_multi_transformer_op_desc.SetAttr("is_test", - dropout_op->GetAttr("is_test")); - fused_multi_transformer_op_desc.SetAttr( - "dropout_implementation", - dropout_op->GetAttr("dropout_implementation")); + fused_multi_transformer_op_desc.SetAttr("dropout_rate", 0.0f); + fused_multi_transformer_op_desc.SetAttr("is_test", true); // parallel ring id auto* c_identity_op = c_identity->Op(); @@ -2852,6 +2708,15 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( IR_NODE_LINK_TO(fill_const_op, cache_kv); IR_NODE_LINK_TO(cache_kv, fused_multi_transformer); + IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_bias, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul1_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd1_b, fused_multi_transformer); + IR_NODE_LINK_TO(fused_multi_transformer, ffn_output); // rewrite while OP input @@ -3024,12 +2889,6 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( ffn_eltadd1_out, fused_multi_transformer_fuse_qkv_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - ffn_dropout, ffn_dropout, fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(ffn_dropout_out, - ffn_dropout_out, - fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd_out, ffn_eltadd_out, fused_multi_transformer_fuse_qkv_pattern) @@ -3061,11 +2920,6 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out, fused_multi_transformer_fuse_qkv_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - dropout_qk, dropout_qk, fused_multi_transformer_fuse_qkv_pattern) - GET_IR_NODE_FROM_SUBGRAPH(dropout_qk_out, - dropout_qk_out, - fused_multi_transformer_fuse_qkv_pattern) GET_IR_NODE_FROM_SUBGRAPH( matmul_qkv, matmul_qkv, fused_multi_transformer_fuse_qkv_pattern); @@ -3107,12 
+2961,6 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( GET_IR_NODE_FROM_SUBGRAPH(eltadd_linear_out, eltadd_linear_out, fused_multi_transformer_fuse_qkv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(dropout_linear, - dropout_linear, - fused_multi_transformer_fuse_qkv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(dropout_linear_out, - dropout_linear_out, - fused_multi_transformer_fuse_qkv_pattern); GET_IR_NODE_FROM_SUBGRAPH( eltadd_out, eltadd_out, fused_multi_transformer_fuse_qkv_pattern); @@ -3132,11 +2980,9 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( split0_k_out, split0_v_out, eltadd_qk_b, - dropout_qk, reshape2_0, matmul_linear_w, eltadd_linear_b, - dropout_linear, while0, ffn_layer_norm, ffn_layer_norm_scale, @@ -3147,12 +2993,9 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( ffn_matmul1_w, ffn_eltadd0_b, ffn_eltadd1_b, - ffn_dropout, ffn_output); std::unordered_set marked_nodes({layer_norm, - layer_norm_scale, - layer_norm_bias, layer_norm_mean, layer_norm_variance, layer_norm_out, @@ -3176,8 +3019,6 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( eltadd_qk_out, softmax_qk, softmax_qk_out, - dropout_qk, - dropout_qk_out, transpose2_qkv, transpose2_qkv_out, matmul_qkv, @@ -3186,19 +3027,13 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( transpose2_qkv, transpose2_qkv_out, matmul_linear, - matmul_linear_w, matmul_linear_out, c_allreduce_sum, c_allreduce_sum_out, eltadd_linear, - eltadd_linear_b, eltadd_linear_out, - dropout_linear, - dropout_linear_out, eltadd_out, ffn_layer_norm, - ffn_layer_norm_scale, - ffn_layer_norm_bias, ffn_layer_norm_mean, ffn_layer_norm_variance, ffn_layer_norm_out, @@ -3216,8 +3051,6 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( ffn_eltadd1_out, ffn_gelu, ffn_gelu_out, - ffn_dropout, - ffn_dropout_out, ffn_eltadd_out}); // Remove unneeded nodes. 
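In the multi-device variants, c_identity and c_allreduce_sum stay in the matched pattern because the linear projections are tensor-parallel: each rank multiplies against its weight shard and c_allreduce_sum combines the partial results over the ring identified by ring_id. A reduced sketch of that data flow, with scalars standing in for tensors and a hypothetical RowParallelLinear helper:

#include <numeric>
#include <vector>

// Row-parallel linear under tensor parallelism, in reduced form: each rank r
// computes a partial y_r = x_r * W_r over its weight shard; c_allreduce_sum
// produces sum_r y_r, and the bias add follows (matching the pattern's
// matmul_linear -> c_allreduce_sum -> eltadd_linear chain).
float RowParallelLinear(const std::vector<float>& partial_outputs, float bias) {
  return std::accumulate(partial_outputs.begin(), partial_outputs.end(), 0.0f) +
         bias;
}

int main() {
  std::vector<float> per_rank = {0.5f, 1.5f, 2.0f};  // hypothetical shard outputs
  return RowParallelLinear(per_rank, 0.25f) == 4.25f ? 0 : 1;
}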
diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.h b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.h index 6e62a69cdf144..55792456b8c83 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.h +++ b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.h @@ -82,8 +82,6 @@ struct FusedMultiTransformerEncoderPattern : public PatternBase { PATTERN_DECL_NODE(eltadd_qk_out); PATTERN_DECL_NODE(softmax_qk); PATTERN_DECL_NODE(softmax_qk_out); - PATTERN_DECL_NODE(dropout_qk); - PATTERN_DECL_NODE(dropout_qk_out); // QK, V matmul PATTERN_DECL_NODE(matmul_qkv); @@ -100,8 +98,6 @@ struct FusedMultiTransformerEncoderPattern : public PatternBase { PATTERN_DECL_NODE(eltadd_linear); PATTERN_DECL_NODE(eltadd_linear_b); PATTERN_DECL_NODE(eltadd_linear_out); - PATTERN_DECL_NODE(dropout_linear); - PATTERN_DECL_NODE(dropout_linear_out); // output elementwise_add PATTERN_DECL_NODE(eltadd_out) @@ -131,8 +127,6 @@ struct FusedMultiTransformerEncoderPattern : public PatternBase { PATTERN_DECL_NODE(ffn_eltadd1); // ELEMENTWISE_ADD PATTERN_DECL_NODE(ffn_eltadd1_b); // ELEMENTWISE_ADD PATTERN_DECL_NODE(ffn_eltadd1_out); - PATTERN_DECL_NODE(ffn_dropout); - PATTERN_DECL_NODE(ffn_dropout_out); // output elementwise_add PATTERN_DECL_NODE(ffn_eltadd_out) @@ -179,8 +173,6 @@ struct FusedMultiTransformerEncoderFuseQKVPattern : public PatternBase { PATTERN_DECL_NODE(eltadd_qk_out); PATTERN_DECL_NODE(softmax_qk); PATTERN_DECL_NODE(softmax_qk_out); - PATTERN_DECL_NODE(dropout_qk); - PATTERN_DECL_NODE(dropout_qk_out); // QK, V matmul PATTERN_DECL_NODE(matmul_qkv); @@ -200,8 +192,6 @@ struct FusedMultiTransformerEncoderFuseQKVPattern : public PatternBase { PATTERN_DECL_NODE(eltadd_linear); PATTERN_DECL_NODE(eltadd_linear_b); PATTERN_DECL_NODE(eltadd_linear_out); - PATTERN_DECL_NODE(dropout_linear); - PATTERN_DECL_NODE(dropout_linear_out); // output elementwise_add PATTERN_DECL_NODE(eltadd_out) @@ -228,8 +218,6 @@ struct FusedMultiTransformerEncoderFuseQKVPattern : public PatternBase { PATTERN_DECL_NODE(ffn_eltadd1); // ELEMENTWISE_ADD PATTERN_DECL_NODE(ffn_eltadd1_b); // ELEMENTWISE_ADD PATTERN_DECL_NODE(ffn_eltadd1_out); - PATTERN_DECL_NODE(ffn_dropout); - PATTERN_DECL_NODE(ffn_dropout_out); // output elementwise_add PATTERN_DECL_NODE(ffn_eltadd_out) @@ -280,8 +268,6 @@ struct MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern PATTERN_DECL_NODE(eltadd_qk_out); PATTERN_DECL_NODE(softmax_qk); PATTERN_DECL_NODE(softmax_qk_out); - PATTERN_DECL_NODE(dropout_qk); - PATTERN_DECL_NODE(dropout_qk_out); // QK, V matmul PATTERN_DECL_NODE(matmul_qkv); @@ -303,8 +289,6 @@ struct MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern PATTERN_DECL_NODE(eltadd_linear); PATTERN_DECL_NODE(eltadd_linear_b); PATTERN_DECL_NODE(eltadd_linear_out); - PATTERN_DECL_NODE(dropout_linear); - PATTERN_DECL_NODE(dropout_linear_out); // output elementwise_add PATTERN_DECL_NODE(eltadd_out) @@ -335,8 +319,6 @@ struct MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern PATTERN_DECL_NODE(ffn_eltadd1); // ELEMENTWISE_ADD PATTERN_DECL_NODE(ffn_eltadd1_b); // ELEMENTWISE_ADD PATTERN_DECL_NODE(ffn_eltadd1_out); - PATTERN_DECL_NODE(ffn_dropout); - PATTERN_DECL_NODE(ffn_dropout_out); // output elementwise_add PATTERN_DECL_NODE(ffn_eltadd_out) diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass_tester.cc b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass_tester.cc index b23f8dcac7b10..2e356d0dc1997 100644 --- 
a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass_tester.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass_tester.cc @@ -81,13 +81,11 @@ TEST(FusedMultiTransformerEncoderPass, basic) { // (transpose_0, transpose_1) matmul -> matmul_qk // (matmul_qk, bias_qk) elementwise_add -> eltadd_qk // (eltadd_qk) softmax -> softmax_qk - // (softmax_qk) dropout -> dropout_qk - // (dropout_qk, transpose_2) matmul_v2 -> matmul_qkv + // (softmax_qk, transpose_2) matmul_v2 -> matmul_qkv // (matmul_qkv) transpose -> transpose_qkv // (transpose_qkv) reshape -> reshape_qkv // (reshape_qkv) matmul_v2 -> matmul_linear // (matmul_linear) elementwise_add -> eltadd_linear - // (eltadd_linear) dropout -> dropout_linear // (eltadd_out) elementwise_add -> attention_out // // (attention_out, scale, bias) layer_norm -> ffn_layer_norm_out @@ -96,8 +94,7 @@ TEST(FusedMultiTransformerEncoderPass, basic) { // (ffn_eltadd0) gelu -> ffn_gelu // (ffn_gelu) matmul_v2 -> ffn_matmul1 // (ffn_matmul1, ffn_bias1) elementwise_add -> ffn_eltadd1 - // (ffn_eltadd1) dropout -> ffn_dropout - // (attention_out, ffn_dropout) elementwise_add -> ffn_output + // (attention_out, ffn_eltadd1) elementwise_add -> ffn_output // // (transpose_1, transpose_2) while -> decoder block @@ -149,10 +146,9 @@ TEST(FusedMultiTransformerEncoderPass, basic) { auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk, nullptr, -1); auto* softmax_qk = layers.softmax(elementwise_qk, -1); - auto* dropout_qk = layers.dropout(softmax_qk, 0.1, "upscale_in_train"); // MHA: QKV matmul - auto* matmul_qkv = layers.matmul_v2(dropout_qk, transpose_2); + auto* matmul_qkv = layers.matmul_v2(softmax_qk, transpose_2); auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 1024}, true); @@ -165,9 +161,7 @@ TEST(FusedMultiTransformerEncoderPass, basic) { auto* linear_eltadd_out = layers.elementwise_add(linear_matmut_out, bias_l, nullptr, 2); - auto* dropout_qkv = - layers.dropout(linear_eltadd_out, 0.1, "upscale_in_train"); - auto* attention_out = layers.elementwise_add(x, dropout_qkv); + auto* attention_out = layers.elementwise_add(x, linear_eltadd_out); // FFN: pre LayerNorm auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true); @@ -190,9 +184,7 @@ TEST(FusedMultiTransformerEncoderPass, basic) { auto* ffn_eltadd1_out = layers.elementwise_add(ffn_matmul1_out, ffn_bias1, nullptr, 2); - // FFN: dropout -> elementwise_add - auto* ffn_dropout = layers.dropout(ffn_eltadd1_out, 0.1, "upscale_in_train"); - layers.elementwise_add(attention_out, ffn_dropout); + layers.elementwise_add(attention_out, ffn_eltadd1_out); std::unique_ptr graph(new ir::Graph(layers.main_program())); graph->Set("__param_scope__", CreateParamScope()); @@ -210,12 +202,12 @@ TEST(FusedMultiTransformerEncoderPass, basic) { int num_fused_nodes_after = GetNumOpNodes(graph, "fused_multi_transformer"); PADDLE_ENFORCE_EQ(num_nodes_before, - num_nodes_after + 68, + num_nodes_after + 56, platform::errors::InvalidArgument( "After the fused_multi_transformer_encoder_pass, The " "node num in graph " "should be %d, but the result is %d", - num_nodes_before - 68, + num_nodes_before - 56, num_nodes_after)); PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1, @@ -246,13 +238,11 @@ TEST(FusedMultiTransformerEncoderFuseQKVPass, basic) { // (split_q, split_k) matmul -> matmul_qk // (matmul_qk, bias_qk) elementwise_add -> eltadd_qk // (eltadd_qk) 
softmax -> softmax_qk - // (softmax_qk) dropout -> dropout_qk - // (dropout_qk, transpose_2) matmul_v2 -> matmul_qkv + // (softmax_qk, transpose_2) matmul_v2 -> matmul_qkv // (matmul_qkv) transpose -> transpose_qkv // (transpose_qkv) reshape -> reshape_qkv // (reshape_qkv) matmul_v2 -> matmul_linear // (matmul_linear) elementwise_add -> eltadd_linear - // (eltadd_linear) dropout -> dropout_linear // (eltadd_out) elementwise_add -> attention_out // // (attention_out, scale, bias) layer_norm -> ffn_layer_norm_out @@ -261,8 +251,7 @@ TEST(FusedMultiTransformerEncoderFuseQKVPass, basic) { // (ffn_eltadd0) gelu -> ffn_gelu // (ffn_gelu) matmul_v2 -> ffn_matmul1 // (ffn_matmul1, ffn_bias1) elementwise_add -> ffn_eltadd1 - // (ffn_eltadd1) dropout -> ffn_dropout - // (attention_out, ffn_dropout) elementwise_add -> ffn_output + // (attention_out, ffn_eltadd1) elementwise_add -> ffn_output // // (transpose_1, transpose_2) while -> decoder block @@ -304,10 +293,9 @@ TEST(FusedMultiTransformerEncoderFuseQKVPass, basic) { auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk); auto* softmax_qk = layers.softmax(elementwise_qk, -1); - auto* dropout_qk = layers.dropout(softmax_qk, 0.1, "upscale_in_train"); // MHA: QKV matmul - auto* matmul_qkv = layers.matmul_v2(dropout_qk, split_v); + auto* matmul_qkv = layers.matmul_v2(softmax_qk, split_v); auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 1024}, true); @@ -320,9 +308,7 @@ TEST(FusedMultiTransformerEncoderFuseQKVPass, basic) { auto* linear_eltadd_out = layers.elementwise_add(linear_matmut_out, bias_l, nullptr, 2); - auto* dropout_qkv = - layers.dropout(linear_eltadd_out, 0.1, "upscale_in_train"); - auto* attention_out = layers.elementwise_add(x, dropout_qkv); + auto* attention_out = layers.elementwise_add(x, linear_eltadd_out); // FFN: pre LayerNorm auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true); @@ -345,9 +331,7 @@ TEST(FusedMultiTransformerEncoderFuseQKVPass, basic) { auto* ffn_eltadd1_out = layers.elementwise_add(ffn_matmul1_out, ffn_bias1, nullptr, 2); - // FFN: dropout -> elementwise_add - auto* ffn_dropout = layers.dropout(ffn_eltadd1_out, 0.1, "upscale_in_train"); - layers.elementwise_add(attention_out, ffn_dropout); + layers.elementwise_add(attention_out, ffn_eltadd1_out); std::unique_ptr graph(new ir::Graph(layers.main_program())); graph->Set("__param_scope__", CreateParamScope()); @@ -366,11 +350,11 @@ TEST(FusedMultiTransformerEncoderFuseQKVPass, basic) { PADDLE_ENFORCE_EQ( num_nodes_before, - num_nodes_after + 56, + num_nodes_after + 44, platform::errors::InvalidArgument( "After the fused_multi_transformer_encoder_fuse_qkv_pass, " "The node num in graph should be %d, but the result is %d", - num_nodes_before - 56, + num_nodes_before - 44, num_nodes_after)); PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1, @@ -402,14 +386,12 @@ TEST(MultiDevicesFusedMultiTransformerEncoderFuseQKVPass, basic) { // (split_q, split_k) matmul -> matmul_qk // (matmul_qk, bias_qk) elementwise_add -> eltadd_qk // (eltadd_qk) softmax -> softmax_qk - // (softmax_qk) dropout -> dropout_qk - // (dropout_qk, transpose_2) matmul_v2 -> matmul_qkv + // (softmax_qk, transpose_2) matmul_v2 -> matmul_qkv // (matmul_qkv) transpose -> transpose_qkv // (transpose_qkv) reshape -> reshape_qkv // (reshape_qkv) matmul_v2 -> matmul_linear // (matmul_linear) c_all_reduce -> c_all_reduce_out // (c_all_reduce_out) 
elementwise_add -> eltadd_linear - // (eltadd_linear) dropout -> dropout_linear // (eltadd_out) elementwise_add -> attention_out // // (attention_out, scale, bias) layer_norm -> ffn_layer_norm_out @@ -420,8 +402,7 @@ TEST(MultiDevicesFusedMultiTransformerEncoderFuseQKVPass, basic) { // (ffn_gelu) matmul_v2 -> ffn_matmul1 // (ffn_matmul1) c_all_reduce -> ffn_c_all_reduce_out // (ffn_c_all_reduce_out, ffn_bias1)elementwise_add -> ffn_eltadd1 - // (ffn_eltadd1) dropout -> ffn_dropout - // (attention_out, ffn_dropout) elementwise_add -> ffn_output + // (attention_out, ffn_eltadd1) elementwise_add -> ffn_output // // (transpose_1, transpose_2) while -> decoder block @@ -464,10 +445,9 @@ TEST(MultiDevicesFusedMultiTransformerEncoderFuseQKVPass, basic) { auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk); auto* softmax_qk = layers.softmax(elementwise_qk, -1); - auto* dropout_qk = layers.dropout(softmax_qk, 0.1, "upscale_in_train"); // MHA: QKV matmul - auto* matmul_qkv = layers.matmul_v2(dropout_qk, split_v); + auto* matmul_qkv = layers.matmul_v2(softmax_qk, split_v); auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 1024}, true); @@ -481,9 +461,7 @@ TEST(MultiDevicesFusedMultiTransformerEncoderFuseQKVPass, basic) { auto* linear_eltadd_out = layers.elementwise_add(c_allreduce_out, bias_l, nullptr, 2); - auto* dropout_qkv = - layers.dropout(linear_eltadd_out, 0.1, "upscale_in_train"); - auto* attention_out = layers.elementwise_add(x, dropout_qkv); + auto* attention_out = layers.elementwise_add(x, linear_eltadd_out); // FFN: pre LayerNorm auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true); @@ -508,9 +486,7 @@ TEST(MultiDevicesFusedMultiTransformerEncoderFuseQKVPass, basic) { auto* ffn_eltadd1_out = layers.elementwise_add(ffn_allreduce_out, ffn_bias1, nullptr, 2); - // FFN: dropout -> elementwise_add - auto* ffn_dropout = layers.dropout(ffn_eltadd1_out, 0.1, "upscale_in_train"); - layers.elementwise_add(attention_out, ffn_dropout); + layers.elementwise_add(attention_out, ffn_eltadd1_out); std::unique_ptr graph(new ir::Graph(layers.main_program())); graph->Set("__param_scope__", CreateParamScope()); @@ -531,11 +507,11 @@ TEST(MultiDevicesFusedMultiTransformerEncoderFuseQKVPass, basic) { PADDLE_ENFORCE_EQ( num_nodes_before, - num_nodes_after + 64, + num_nodes_after + 52, platform::errors::InvalidArgument( "After the fused_multi_transformer_encoder_fuse_qkv_pass, " "The node num in graph should be %d, but the result is %d", - num_nodes_before - 64, + num_nodes_before - 52, num_nodes_after)); PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1, diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 4a1bf4baecc14..74ad71f37da69 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -39,6 +39,7 @@ namespace ir { static const char kParamScopeAttr[] = "__param_scope__"; static const std::vector support_subgraph_passes = { + "simplify_with_basic_ops_pass", "fused_multi_transformer_encoder_pass", "fused_multi_transformer_decoder_pass", "fused_multi_transformer_encoder_fuse_qkv_pass", From 3097a66dfe78e88d7a357b6078b6a39654415ecb Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Tue, 1 Nov 2022 14:57:57 +0800 Subject: [PATCH 58/91] Filter npu xpu mlu ipu file (#47512) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit

* Modify the .gitignore file to ignore the ljd_sh files

* Stop ignoring the ljd_sh files

* filter npu xpu mlu ipu file

* filter xpu npu mlu ipu files

* filter npu xpu mlu ipu file
---
 tools/get_pr_ut.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py
index 8962428ba0cab..0788da6e116ec 100644
--- a/tools/get_pr_ut.py
+++ b/tools/get_pr_ut.py
@@ -338,6 +338,10 @@ def get_pr_ut(self):
                         file_list.append(filename)
                     else:
                         filterFiles.append(filename)
+                elif any(
+                    x in filename.lower() for x in ('/xpu/', '/npu/', '/mlu/', '/ipu/')
+                ):
+                    filterFiles.append(filename)
                 else:
                     file_list.append(filename)
         else:

From 3592ba8c1c3352228c5870f93462260fce04127c Mon Sep 17 00:00:00 2001
From: Nyakku Shigure
Date: Tue, 1 Nov 2022 15:10:58 +0800
Subject: [PATCH 59/91] [CodeStyle][py2] remove `six` package (part2) (#47334)

* [CodeStyle][py2] remove `six` package (part2)

* six.ensure_str

* remove unused `import six`

* remove six from BUILTIN_LIKELY_MODULES

* remove six in example code

* remove some decode

* try to fix example code

* fix MockEtcdClient get/get_prefix returns data type

* fix MockEtcdClient get_prefix returns data

* fix MockEtcdClient get returns data

* remove `six` in pypi and conda requirements

* fix MockEtcdClient add_watch_callback/add_watch_prefix_callback returns data type

* refine MockEtcdClient
---
 paddle/scripts/conda_build.py                 |  2 --
 .../distributed/fleet/elastic/manager.py      | 12 +++----
 .../distributed/launch/controllers/master.py  | 17 +++++-----
 .../quantization/imperative/ptq_quantizer.py  |  1 -
 .../fluid/dataloader/dataloader_iter.py       | 13 ++++----
 python/paddle/fluid/dataloader/worker.py      |  3 +-
 .../dygraph_to_static/convert_call_func.py    |  2 --
 python/paddle/fluid/executor.py               | 32 ++++++++-----------
 python/paddle/fluid/framework.py              |  4 +--
 .../fleet/parameter_server/ir/public.py       |  1 -
 python/paddle/fluid/layers/io.py              |  5 ++-
 python/paddle/fluid/layers/nn.py              |  7 ++--
 python/paddle/fluid/layers/utils.py           |  1 -
 python/paddle/fluid/reader.py                 | 23 +++++++------
 .../tests/unittests/npu/test_save_load_npu.py |  1 -
 .../unittests/test_fleet_elastic_manager.py   | 26 +++++++++++----
 python/paddle/reader/decorator.py             | 13 ++++----
 python/requirements.txt                       |  1 -
 18 files changed, 77 insertions(+), 87 deletions(-)

diff --git a/paddle/scripts/conda_build.py b/paddle/scripts/conda_build.py
index 12004460a5036..98692ff3df39c 100644
--- a/paddle/scripts/conda_build.py
+++ b/paddle/scripts/conda_build.py
@@ -55,7 +55,6 @@ def __init__(self):
     - protobuf>=3.1.0
     - gast==0.3.3
     - Pillow
-    - six
     - decorator
     - astor
     """
@@ -67,7 +66,6 @@ def __init__(self):
     - protobuf>=3.1.0
     - gast==0.3.3
     - Pillow
-    - six
    - decorator
     - astor
     """

diff --git a/python/paddle/distributed/fleet/elastic/manager.py b/python/paddle/distributed/fleet/elastic/manager.py
index 9397f6d10d4a6..b23a116422590 100644
--- a/python/paddle/distributed/fleet/elastic/manager.py
+++ b/python/paddle/distributed/fleet/elastic/manager.py
@@ -15,7 +15,6 @@
 import time
 import socket
 import os
-import six
 import copy
 import signal
 import random
@@ -244,8 +243,7 @@ def __init__(self, args, etcd_client):
         # register callback
         def host_call_back(event):
             self.hosts = [
-                six.ensure_str(i[0])
-                for i in self.etcd.get_prefix(self.node_prefix)
+                i[0].decode() for i in self.etcd.get_prefix(self.node_prefix)
             ]
             self.hosts = list(set(self.hosts)) if self.hosts else self.hosts
             logger.info(
@@ -266,7 +264,7 @@ def lease_heartbeat():
                     host_lease.refresh()

                     hosts = [
-                        six.ensure_str(i[0])
+                        i[0].decode()
                         for i in self.etcd.get_prefix(self.node_prefix)
                     ]
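The host_call_back and lease_heartbeat hunks above lean on the fact that the etcd3 client hands values back as raw bytes (or None for a missing key), so six.ensure_str can become an explicit decode. A minimal sketch of the substituted pattern, with a hypothetical client object standing in for self.etcd:

    # six.ensure_str(v) accepted both str and bytes; bytes.decode() assumes
    # bytes, which holds here because etcd3 always returns values as bytes.
    value = client.get(key)[0]  # bytes, or None if the key is absent
    text = value.decode() if value is not None else ''
    hosts = [v.decode() for v, _meta in client.get_prefix(prefix)]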
hosts = list(set(hosts)) if hosts else hosts @@ -311,7 +309,8 @@ def lease_heartbeat(): def endpoints_call_back(event): if not self.dist_endpoints: return - edps = six.ensure_str(self.etcd.get(self.endpoints_path)[0] or '') + value = self.etcd.get(self.endpoints_path)[0] + edps = value.decode() if value is not None else '' self.dist_endpoints, self.trainers = edps.split('|') logger.info( "set DISTRIBUTED_TRAINER_ENDPOINTS {} ".format( @@ -426,8 +425,7 @@ def _match(self, host_list: list = None): self.hosts = host_list else: self.hosts = [ - six.ensure_str(i[0]) - for i in self.etcd.get_prefix(self.node_prefix) + i[0].decode() for i in self.etcd.get_prefix(self.node_prefix) ] self.hosts = list(set(self.hosts)) if self.hosts else self.hosts diff --git a/python/paddle/distributed/launch/controllers/master.py b/python/paddle/distributed/launch/controllers/master.py index 128852e092cfb..9c3f0a8501a3a 100644 --- a/python/paddle/distributed/launch/controllers/master.py +++ b/python/paddle/distributed/launch/controllers/master.py @@ -17,7 +17,6 @@ import time import sys -import six import threading import copy import random @@ -214,22 +213,22 @@ def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): if len(result) == size: if rank < 0: - keys = [six.ensure_str(i[1].key) for i in result] - sorted_keys = [six.ensure_str(i[1].key) for i in result] + keys = [i[1].key.decode() for i in result] + sorted_keys = [i[1].key.decode() for i in result] sorted_keys.sort() - values = [six.ensure_str(i[0]) for i in result] + values = [i[0].decode() for i in result] ret = [values[keys.index(k)] for k in sorted_keys] idx = ret.index(value) return ret, idx else: ret = [None] * size for v, k in result: - ii = int(six.ensure_str(k.key).split('/')[-1]) + ii = int(k.key.decode().split('/')[-1]) if ii < 0: self.ctx.logger.error( "rank {} error in sync".format(ii) ) - ret[ii] = six.ensure_str(v) + ret[ii] = v.decode() return ret, rank else: time.sleep(0.5) @@ -278,8 +277,7 @@ def _heartbeat(): def fetch_peer_alive(self): peer_alive = [ - six.ensure_str(i[0]) - for i in self.client.get_prefix(self.heartbeat_prefix) + i[0].decode() for i in self.client.get_prefix(self.heartbeat_prefix) ] self.ctx.logger.debug("peer alive {}".format(peer_alive)) return peer_alive @@ -319,7 +317,8 @@ def set_status(self, status): ), "set status failed {}".format(status) def get_status(self): - return six.ensure_str(self.client.get(self.job_prefix)[0] or '') + value = self.client.get(self.job_prefix)[0] + return value.decode() if value is not None else '' def stop(self): if hasattr(self, 'beat_thread'): diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_quantizer.py b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_quantizer.py index ba881f88efc39..3dfc95a8ac346 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_quantizer.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_quantizer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import six import abc import copy import math diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 83d95c479250e..aa5d71293fdab 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -13,7 +13,6 @@ # limitations under the License. 
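The dataloader and reader hunks that follow replace six.reraise(*sys.exc_info()) with a plain raise statement. Inside an except block the two are equivalent on Python 3, which is all Paddle now supports; raise e, used where the handler needs the exception by name, re-raises the same object with the new raise site appended to its traceback. A sketch of the pattern, with risky_read and cleanup as stand-ins for the real reader work:

    try:
        risky_read()   # stand-in for reading from the worker result queue
    except Exception:
        cleanup()      # e.g. close the blocking queue
        raise          # Python 3: preserves the original traceback
        # previously: six.reraise(*sys.exc_info())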
import os -import six import sys import time import signal @@ -284,9 +283,9 @@ def _thread_loop(self, legacy_expected_place): except: self._exit_thread_expectedly() - except: + except Exception as e: self._exit_thread_unexpectedly() - six.reraise(*sys.exc_info()) + raise e self._exit_thread_expectedly() @@ -334,7 +333,7 @@ def __next__(self): except StopIteration: self._reader.shutdown() self._try_shutdown_all() - six.reraise(*sys.exc_info()) + raise finally: if in_profiler_mode(): trace_event.end() @@ -629,7 +628,7 @@ def _thread_loop(self, legacy_expected_place): self._blocking_queue.close() except Exception as e: self._exit_thread_unexpectedly() - six.reraise(*sys.exc_info()) + raise e finally: self._rcvd_idx += 1 @@ -715,7 +714,7 @@ def _get_data(self): "DataLoader reader thread failed({}) to read data from " "workers' result queue.".format(e) ) - six.reraise(*sys.exc_info()) + raise e else: if self._dataset_kind == _DatasetKind.ITER and isinstance( data, _IterableDatasetStopIteration @@ -850,7 +849,7 @@ def __next__(self): if not self._persistent_workers: self._reader.shutdown() self._try_shutdown_all() - six.reraise(*sys.exc_info()) + raise finally: if in_profiler_mode(): trace_event.end() diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index f0aa32e774522..fef07df92751a 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -13,7 +13,6 @@ # limitations under the License. import os -import six import sys import paddle import numpy as np @@ -395,7 +394,7 @@ def tensor_share_memory(tensor): # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process pass except: - six.reraise(*sys.exc_info()) + raise finally: if use_shared_memory: _cleanup_mmap() diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index 5adf810eef4a3..b43c4e9d36d60 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -22,7 +22,6 @@ import types import numpy -import six import builtins from paddle.fluid.dygraph.container import Sequential @@ -58,7 +57,6 @@ copy, inspect, re, - six, numpy, logging, ] diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index ba8221bb2f0fb..55a0334c8e933 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -19,7 +19,6 @@ import warnings import numpy as np from .wrapped_decorator import signature_safe_contextmanager -import six from .data_feeder import convert_dtype from .framework import Program, default_main_program, Variable, Operator from .framework import convert_np_dtype_to_dtype_, _apply_pass @@ -1574,23 +1573,20 @@ def run( ] self._log_force_set_program_cache(use_program_cache) - try: - res = self._run_impl( - program=program, - feed=feed, - fetch_list=fetch_list, - feed_var_name=feed_var_name, - fetch_var_name=fetch_var_name, - scope=scope, - return_numpy=return_numpy, - use_program_cache=use_program_cache, - use_prune=use_prune, - return_merged=return_merged, - ) - core.update_autotune_status() - return res - except Exception as e: - six.reraise(*sys.exc_info()) + res = self._run_impl( + program=program, + feed=feed, + fetch_list=fetch_list, + feed_var_name=feed_var_name, + fetch_var_name=fetch_var_name, + scope=scope, + return_numpy=return_numpy, + use_program_cache=use_program_cache, + 
use_prune=use_prune, + return_merged=return_merged, + ) + core.update_autotune_status() + return res def _run_impl( self, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4d8e356d63d8c..188ff9a8ea8d7 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -5777,10 +5777,10 @@ def clone(self, for_test=False): .. code-block:: python - import six + import paddle def print_prog(prog): - for name, value in sorted(six.iteritems(prog.block(0).vars)): + for name, value in sorted(prog.block(0).vars.items()): print(value) for op in prog.block(0).ops: print("op type is {}".format(op.type)) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py index 9b762bb9ee7f8..865ffd2e0fd0c 100755 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py @@ -19,7 +19,6 @@ import os import warnings import logging -import six import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.core import CommContext diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 86654bbf66958..758cd94c21421 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -14,7 +14,6 @@ import multiprocessing import os -import six import sys import threading @@ -523,10 +522,10 @@ def __provider_thread__(legacy_expected_place): if reader.exited: break feed_queue.close() - except Exception as ex: + except Exception as e: feed_queue.kill() logging.warn('Your decorated reader has raised an exception!') - six.reraise(*sys.exc_info()) + raise e reader.thread = threading.Thread( target=__provider_thread__, args=(_current_expected_place(),) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 17bd7ff86dbf9..69c00b5dbb1f2 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -14998,7 +14998,6 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): # example 1: import paddle - import six import numpy as np paddle.enable_static() @@ -15024,7 +15023,7 @@ def create_tmp_var(name, dtype, shape): def simple_net(img, label): hidden = img - for idx in six.moves.range(4): + for idx in range(4): hidden = paddle.static.nn.fc(hidden, size=200) new_hidden = create_tmp_var(name='hidden_{}'.format(idx), dtype=hidden.dtype, shape=hidden.shape) @@ -15042,13 +15041,13 @@ def simple_net(img, label): return ce_loss(prediction, label) x = paddle.static.data(name='x', shape=[1,4], dtype='float32') - y = paddle.static.data(name='y', shape=[1,10], dtype='int64') + y = paddle.static.data(name='y', shape=[1], dtype='int64') res = simple_net(x, y) exe = paddle.static.Executor(paddle.CPUPlace()) exe.run(paddle.static.default_startup_program()) input1 = np.random.random(size=[1,4]).astype('float32') - input2 = np.random.randint(1, 10, size=[1,10], dtype='int64') + input2 = np.random.randint(1, 10, size=[1], dtype='int64') out = exe.run(paddle.static.default_main_program(), feed={'x':input1, 'y':input2}, fetch_list=[res.name]) diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index e96e105c141df..66fc253bb097b 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -14,7 +14,6 @@ import collections import copy -import six import numpy as np from ..framework import Block, Variable, 
_non_static_mode from ..data_feeder import ( diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 5c2e92ae458e7..d71a4ab184a0f 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -14,7 +14,6 @@ from . import core import sys -import six import numpy as np import threading import paddle @@ -143,7 +142,7 @@ def _reader_process_loop(batch_reader, data_queue): # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process pass except: - six.reraise(*sys.exc_info()) + raise class DataLoaderBase(object): @@ -1202,7 +1201,7 @@ def __next__(self): return self._reader.read_next_var_list() except StopIteration: self._reset() - six.reraise(*sys.exc_info()) + raise def _exit_thread_expectedly(self): self._thread_done_event.set() @@ -1232,7 +1231,7 @@ def _reader_thread_loop_for_multiprocess(self, legacy_expected_place): # start trying to get data from queue. At this time, the child thread needs # to wait slightly longer tensor_list = self._data_queue.get(timeout=QUEUE_GET_TIMEOUT) - except: + except Exception as e: # NOTE [ avoid handing ] After adding the shared memory mechanism, not only # the queue.Empty exception will occur here, but other exceptions will also # occur, such as mmap failure. If it is not handled here, it will hang. @@ -1240,7 +1239,7 @@ def _reader_thread_loop_for_multiprocess(self, legacy_expected_place): logging.error( "DataLoader reader thread failed to read data from the multiprocessing.Queue." ) - six.reraise(*sys.exc_info()) + raise e if not self._thread_done_event.is_set(): if tensor_list is not None: @@ -1250,9 +1249,9 @@ def _reader_thread_loop_for_multiprocess(self, legacy_expected_place): array.append(tensor) if not self._blocking_queue.push(array): self._blocking_queue.close() - except: + except Exception as e: self._exit_thread_unexpectedly() - six.reraise(*sys.exc_info()) + raise e else: self._exit_thread_expectedly() @@ -1278,13 +1277,13 @@ def _reader_thread_loop_for_singleprocess(self, legacy_expected_place): self._blocking_queue.close() self._thread = None - except Exception: + except Exception as e: self._blocking_queue.kill() self._thread = None logging.warning( "DygraphDataLoader reader thread raised an exception." 
) - six.reraise(*sys.exc_info()) + raise e def set_sample_generator( self, reader, batch_size, drop_last=True, places=None @@ -1510,7 +1509,7 @@ def __next__(self): except StopIteration: self._queue.close() self._reset() - six.reraise(*sys.exc_info()) + raise def start(self): assert ( @@ -1551,11 +1550,11 @@ def __thread_main__(legacy_expected_place): self._queue.close() self._thread = None - except Exception as ex: + except Exception as e: self._queue.kill() self._thread = None logging.warning('Your reader has raised an exception!') - six.reraise(*sys.exc_info()) + raise e self._thread = threading.Thread( target=__thread_main__, args=(_current_expected_place(),) diff --git a/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py b/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py index 09f25273696ef..0f86bafa91352 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py @@ -26,7 +26,6 @@ from test_imperative_base import new_program_scope from paddle.fluid.executor import global_scope import numpy as np -import six import pickle import os import errno diff --git a/python/paddle/fluid/tests/unittests/test_fleet_elastic_manager.py b/python/paddle/fluid/tests/unittests/test_fleet_elastic_manager.py index a4ad3e1a7c4ea..0570ae1928c12 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_elastic_manager.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_elastic_manager.py @@ -27,6 +27,16 @@ def refresh(self): pass +class MockKVMetadata: + def __init__(self, key): + self.key = key + self.create_revision = 2 + self.mod_revision = 3 + self.version = 2 + self.lease_id = 0 + self.response_header = None + + class MockEtcdClient: def __init__(self, lease=None): self._lease = lease @@ -35,28 +45,30 @@ def put(self, key, value, lease=None): pass def get(self, key): - value = "0" - return value, value + return b'0', MockKVMetadata(b"/prefix") def delete_prefix(self, key): pass def get_prefix(self, key_prefix): - hosts = ["10.10.10.1:6001", "10.10.10.2:6001"] - return hosts + hosts = [ + (b"/prefix/host1", b"10.10.10.1:6001"), + (b"/prefix/host2", b"10.10.10.2:6001"), + ] + return ((v, MockKVMetadata(k)) for k, v in hosts) def add_watch_callback(self, *args, **kwargs): - return "host_watch" + return 0 def add_watch_prefix_callback(self, key_prefix, callback, **kwargs): callback(None) - return "host_watch" + return 0 def cancel_watch(self, watch_id): pass def delete(self, key): - pass + return True def lease(self, ttl): if self._lease: diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index b2aa88cc810b4..1969c7ba11e9e 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -14,7 +14,6 @@ from threading import Thread import multiprocessing -import six import sys import warnings import logging @@ -610,9 +609,9 @@ def _read_into_queue(reader, queue): raise ValueError("sample has None") queue.put(sample) queue.put(None) - except: + except Exception as e: queue.put("") - six.reraise(*sys.exc_info()) + raise e def queue_reader(): queue = fork_context.Queue(queue_size) @@ -627,11 +626,11 @@ def queue_reader(): while finish_num < reader_num: try: sample = queue.get(timeout=QUEUE_GET_TIMEOUT) - except: + except Exception as e: logging.error( "multiprocess_reader failed to get data from the multiprocessing.Queue." 
) - six.reraise(*sys.exc_info()) + raise e if sample is None: finish_num += 1 @@ -650,10 +649,10 @@ def _read_into_pipe(reader, conn): conn.send(json.dumps(sample)) conn.send(json.dumps(None)) conn.close() - except: + except Exception as e: conn.send(json.dumps("")) conn.close() - six.reraise(*sys.exc_info()) + raise e def pipe_reader(): conns = [] diff --git a/python/requirements.txt b/python/requirements.txt index 74f2c2b9401aa..fcdfddc9e2eb6 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -2,7 +2,6 @@ requests>=2.20.0 numpy>=1.13 protobuf>=3.1.0, <=3.20.0 Pillow -six decorator astor paddle_bfloat==0.1.7 From e930c5763e063c1e009798a487b87ebb2d6092eb Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 1 Nov 2022 15:58:58 +0800 Subject: [PATCH 60/91] [EinsumOp] Einsum support complex grad (#47514) * Einsum Support Complex * code fix * add unittest for complex grad with einsum * set rtol=1e-4 * fix --- paddle/phi/kernels/impl/einsum_grad_impl.h | 8 ++- .../fluid/tests/unittests/test_einsum_v2.py | 65 ++++++++++++++++++- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index bf27f3ef2b1be..816badcd79e55 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -15,6 +15,7 @@ #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/impl/einsum_impl.h" #include "paddle/phi/kernels/tile_kernel.h" #include "paddle/utils/string/string_helper.h" @@ -177,11 +178,12 @@ void EinsumGradKernel(const Context& dev_ctx, auto operands_for_A = std::vector(); auto operands_for_B = std::vector(); DenseTensor dA, dB; + auto out_grad_conj = Conj(dev_ctx, out_grad); // dA = einsum(B, dC) operands_for_A.push_back(x[1]); - operands_for_A.push_back(&out_grad); + operands_for_A.push_back(&out_grad_conj); // dB = einsum(dC, A) - operands_for_B.push_back(&out_grad); + operands_for_B.push_back(&out_grad_conj); operands_for_B.push_back(x[0]); DenseTensor before_tile; @@ -219,6 +221,7 @@ void EinsumGradKernel(const Context& dev_ctx, ellipsis_dims[0], ops[0], dA); + *(x_grad[0]) = Conj(dev_ctx, *x_grad[0]); } if (x_grad[1]) { *(x_grad[1]) = PerformTileAndReduction(dev_ctx, @@ -228,6 +231,7 @@ void EinsumGradKernel(const Context& dev_ctx, ellipsis_dims[1], ops[1], dB); + *(x_grad[1]) = Conj(dev_ctx, *x_grad[1]); } } } diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py index c7d2f9c76b250..f45f9ace1cc64 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -580,7 +580,7 @@ def test_shape(self): A = paddle.to_tensor(np.array([1.0, 2.0])) A_expect = paddle.to_tensor([[1.0, 0.0], [0.0, 2.0]]) A_actual = paddle.einsum('i->ii', A) - np.array_equal(A_expect.numpy(), A_actual.numpy()) + assert np.array_equal(A_expect.numpy(), A_actual.numpy()) class TestSimpleUndiagonal2(unittest.TestCase): @@ -594,7 +594,68 @@ def test_shape(self): B = paddle.to_tensor(np.array([1.0, 1.0])) A_expect = paddle.to_tensor([[2.0, 0.0], [0.0, 4.0]]) A_actual = paddle.einsum('i,j->ii', A, B) - np.array_equal(A_expect.numpy(), A_actual.numpy()) + assert np.array_equal(A_expect.numpy(), A_actual.numpy()) + + +class TestSimpleComplexGrad(unittest.TestCase): + """ + EinsumOp support complex grad. 
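For complex inputs the backward kernel above applies the conjugate (Wirtinger) gradient convention: the incoming gradient is conjugated before the two backward einsums, and each result is conjugated again. A numpy sketch of the same rule for the 'iox,ojx->ijx' case exercised by TestSimpleComplexGrad, with numpy standing in for the Paddle kernels:

    import numpy as np

    # dA = conj(einsum(B, conj(dOut))) with the indices rearranged, mirroring
    # operands_for_A = {B, out_grad_conj} plus Conj in einsum_grad_impl.h.
    A = np.random.randn(2, 3, 1) + 1j * np.random.randn(2, 3, 1)
    B = np.random.randn(3, 2, 1) + 1j * np.random.randn(3, 2, 1)
    dOut = np.random.randn(2, 2, 1) + 1j * np.random.randn(2, 2, 1)
    dA = np.conj(np.einsum('ojx,ijx->iox', B, np.conj(dOut)))  # shape of A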
but op_test don't support numeric grad for complex dtype. + """ + + def test_shape(self): + paddle.disable_static() + A = paddle.to_tensor( + [ + [ + [-1.08644637 + 1.30794563j], + [-0.89606513 + 1.84546043j], + [-0.30629937 + 0.82911495j], + ], + [ + [-1.33993366 - 0.02329881j], + [-1.20658558 - 0.20856395j], + [-0.64172681 - 0.91661975j], + ], + ] + ) + + B = paddle.to_tensor( + [ + [[-1.07474258 + 0.39477287j], [-0.08614349 - 0.38770082j]], + [[1.17583854 + 0.58840176j], [-1.63509173 - 1.43329882j]], + [[1.228194 - 0.32357468j], [1.07638625 + 1.25298469j]], + ] + ) + + dOut = paddle.to_tensor( + [ + [[-0.73074259 - 0.1632133j], [1.42848507 - 0.96410727j]], + [[0.94465389 - 0.34264733j], [-0.26400278 + 0.04890404j]], + ] + ) + + d_expect = paddle.to_tensor( + [ + [ + [0.971658 + 1.100766j], + [-1.909121 + 3.861908j], + [-0.515092 - 3.264529j], + ], + [ + [-1.146746 - 0.111233j], + [1.270721 - 1.417091j], + [1.048197 + 0.268260j], + ], + ] + ) + + A.stop_gradient = False + B.stop_gradient = False + Out = paddle.einsum('iox,ojx->ijx', A, B) + dA = paddle.grad(Out, A, dOut)[0] + np.testing.assert_allclose( + dA.numpy(), d_expect.numpy(), rtol=1e-6, atol=0 + ) if __name__ == "__main__": From e12b6c04a4140995c6832f56c24a38a16e7b579c Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Tue, 1 Nov 2022 16:48:30 +0800 Subject: [PATCH 61/91] Support custom stream for standalone executor (#47411) * [Auto Parallel] Improve the c++ dist attr * [Auto Parallel] Modify test_program.py * Support custom stream for standalone executor Co-authored-by: Yulong Ao --- .../distributed/auto_parallel/dist_attr.cc | 12 +- .../distributed/auto_parallel/dist_attr.h | 9 ++ .../new_executor/interpreter/data_transfer.cc | 1 + .../interpreter/dependency_builder.cc | 7 +- .../interpreter/interpreter_util.cc | 116 +++++++++++------- .../interpreter/interpreter_util.h | 24 ++-- .../new_executor/new_executor_defs.h | 26 ++-- .../framework/new_executor/stream_analyzer.cc | 21 ++-- paddle/fluid/framework/op_desc.cc | 4 + paddle/fluid/framework/op_desc.h | 1 + paddle/fluid/pybind/auto_parallel_py.cc | 3 + .../fluid/tests/unittests/CMakeLists.txt | 2 +- .../CMakeLists.txt | 0 .../test_standalone_controlflow.py | 0 .../test_standalone_custom_stream.py | 83 +++++++++++++ .../test_standalone_executor.py | 0 .../test_standalone_multiply_write.py | 0 17 files changed, 224 insertions(+), 85 deletions(-) rename python/paddle/fluid/tests/unittests/{interpreter => standalone_executor}/CMakeLists.txt (100%) rename python/paddle/fluid/tests/unittests/{interpreter => standalone_executor}/test_standalone_controlflow.py (100%) create mode 100644 python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_custom_stream.py rename python/paddle/fluid/tests/unittests/{interpreter => standalone_executor}/test_standalone_executor.py (100%) rename python/paddle/fluid/tests/unittests/{interpreter => standalone_executor}/test_standalone_multiply_write.py (100%) diff --git a/paddle/fluid/distributed/auto_parallel/dist_attr.cc b/paddle/fluid/distributed/auto_parallel/dist_attr.cc index 57a5b40768af5..5b97393864d74 100644 --- a/paddle/fluid/distributed/auto_parallel/dist_attr.cc +++ b/paddle/fluid/distributed/auto_parallel/dist_attr.cc @@ -319,7 +319,7 @@ bool operator==(const TensorDistAttr& lhs, const TensorDistAttr& rhs) { } std::vector OperatorDistAttr::fields_{ - "process_mesh", "impl_type", "impl_idx"}; + "process_mesh", "impl_type", "impl_idx", "execution_stream"}; OperatorDistAttr::OperatorDistAttr(const OpDesc& op) : op_(&op) { 
VLOG(4) << "[OperatorDistAttr constructor] op type: " << op_->Type(); @@ -376,8 +376,9 @@ void OperatorDistAttr::initialize() { output_dist_attrs_[name] = TensorDistAttr(*output); } } - impl_type_ = "default"; + impl_type_ = kDefault; impl_idx_ = 0; + execution_stream_ = kDefault; } void OperatorDistAttr::copy_from(const OperatorDistAttr& dist_attr) { @@ -386,9 +387,8 @@ void OperatorDistAttr::copy_from(const OperatorDistAttr& dist_attr) { set_process_mesh(dist_attr.process_mesh()); set_impl_type(dist_attr.impl_type()); set_impl_idx(dist_attr.impl_idx()); + set_execution_stream(dist_attr.execution_stream()); set_annotated(dist_attr.annotated()); - impl_type_ = dist_attr.impl_type(); - impl_idx_ = dist_attr.impl_idx(); } void OperatorDistAttr::set_input_dist_attrs( @@ -666,6 +666,7 @@ std::string OperatorDistAttr::to_string() const { } str += "impl_type: " + impl_type_ + ", "; str += "impl_idx: " + std::to_string(impl_idx_) + ", "; + str += "execution_stream: " + execution_stream_ + ", "; str += "annotated: [" + str_join(annotated_) + "], "; str += "\nprocess_mesh: " + process_mesh_.to_string() + ", "; str += "\ninput_dist_attrs: [\n"; @@ -747,6 +748,9 @@ bool operator==(const OperatorDistAttr& lhs, const OperatorDistAttr& rhs) { if (lhs.impl_idx() != rhs.impl_idx()) { return false; } + if (lhs.execution_stream() != rhs.execution_stream()) { + return false; + } for (auto const& item : lhs.input_dist_attrs()) { if (rhs.input_dist_attrs().count(item.first) != 1) { return false; diff --git a/paddle/fluid/distributed/auto_parallel/dist_attr.h b/paddle/fluid/distributed/auto_parallel/dist_attr.h index d4aa306e71273..61e61e2e53dd6 100644 --- a/paddle/fluid/distributed/auto_parallel/dist_attr.h +++ b/paddle/fluid/distributed/auto_parallel/dist_attr.h @@ -46,6 +46,8 @@ using framework::OpDesc; using framework::ProgramDesc; using framework::VarDesc; +constexpr const char* kDefault = "default"; + class TensorDistAttr { public: TensorDistAttr() = default; @@ -205,6 +207,12 @@ class OperatorDistAttr { void set_impl_idx(const int64_t& impl_idx) { impl_idx_ = impl_idx; } + const std::string& execution_stream() const { return execution_stream_; } + + void set_execution_stream(const std::string& execution_stream) { + execution_stream_ = execution_stream; + } + const std::map& annotated() const { return annotated_; } void set_annotated(const std::map& annotated); @@ -262,6 +270,7 @@ class OperatorDistAttr { ProcessMesh process_mesh_; std::string impl_type_; int64_t impl_idx_ = -1; + std::string execution_stream_; std::map annotated_; }; diff --git a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc index efe10fcd5f3a9..bf51ebd1d48d7 100644 --- a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/new_executor/interpreter/data_transfer.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index 3b2a2aed7f367..ae7d7e42536ff 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc 
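The plumbing above stores a per-op stream name in OperatorDistAttr (defaulting to kDefault) so the interpreter can later route the op to a dedicated device context. A minimal sketch of the user-facing side, modelled on the unit test added later in this patch (the op index and stream name are illustrative):

    import paddle

    paddle.enable_static()
    main_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog):
        x = paddle.static.data(name='x', shape=[2, 2], dtype='float32')
        y = paddle.tanh(x)
    # route the tanh op to stream "s1"; untouched ops keep the default stream
    main_prog.global_block().ops[-1].dist_attr.execution_stream = "s1"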
@@ -151,9 +151,8 @@ void DependencyBuilder::AddDependencyForCoalesceTensorOp() { // 'first_read_fused_out_op' size_t target = first_read_fused_out_op; for (size_t j = first_read_fused_out_op + 1; j < op_num_; ++j) { - if (j == target + 1 && - IsCommunicationOp(instructions_->at(target).OpBase()->Type()) && - IsCommunicationOp(instructions_->at(j).OpBase()->Type())) { + if (j == target + 1 && IsCommunicationOp(instructions_->at(target)) && + IsCommunicationOp(instructions_->at(j))) { VLOG(4) << "Found consecutive communication ops, " << instructions_->at(target).OpBase()->Type() << " -> " << instructions_->at(j).OpBase()->Type(); @@ -174,7 +173,7 @@ void DependencyBuilder::AddDependencyForCoalesceTensorOp() { void DependencyBuilder::AddDependencyForCommunicationOp() { int dependence_op_idx = -1; for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { - if (IsCommunicationOp(instructions_->at(op_idx).OpBase()->Type())) { + if (IsCommunicationOp(instructions_->at(op_idx))) { if (dependence_op_idx != -1) { AddDownstreamOp(dependence_op_idx, op_idx); } diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index ae646ed42dbcc..104217fa80f22 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -16,6 +16,7 @@ #include +#include "paddle/fluid/distributed/auto_parallel/dist_attr.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/new_executor/interpreter/data_transfer.h" @@ -125,18 +126,60 @@ void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type, } } -void LogDeviceMemoryStats(const platform::Place& place) { - if (FLAGS_new_executor_log_memory_stats && platform::is_gpu_place(place)) { - VLOG(0) << "memory_allocated: " - << static_cast(memory::DeviceMemoryStatCurrentValue( - "Allocated", place.device)) / - 1024 / 1024 - << " MB"; - VLOG(0) << "max_memory_allocated: " - << static_cast(memory::DeviceMemoryStatPeakValue( - "Allocated", place.device)) / - 1024 / 1024 - << " MB"; +bool IsCommunicationOp(const Instruction& instr) { + const std::set special_comm_op_set = { + "send", + "recv", + "send_v2", + "recv_v2", + }; + const std::string& op_name = instr.OpBase()->Type(); + const std::string communication_op_prefix = "c_"; + if (op_name.find(communication_op_prefix) != std::string::npos || + special_comm_op_set.count(op_name)) { + return true; + } + return false; +} + +bool IsCpuOp(const Instruction& instr) { + return platform::is_cpu_place(instr.DeviceContext().GetPlace()); +} + +bool IsSupportedHeterPlace(const phi::Place& place) { + return platform::is_gpu_place(place) || platform::is_npu_place(place) || + platform::is_xpu_place(place) || platform::is_ipu_place(place) || + platform::is_custom_place(place); +} + +bool IsMemcpyD2H(const Instruction& instr) { + return instr.OpBase()->Type() == kMemcpyD2H; +} + +bool IsMemcpyH2D(const Instruction& instr) { + return instr.OpBase()->Type() == kMemcpyH2D; +} + +bool IsMemcpyOp(const Instruction& instr) { + return IsMemcpyD2H(instr) || IsMemcpyH2D(instr); +} + +void AddFetch(const std::vector& fetch_names, + framework::BlockDesc* block) { + auto* fetch_holder = block->Var(kFetchVarName); + fetch_holder->SetType(proto::VarType::FETCH_LIST); + fetch_holder->SetPersistable(true); + + int i = 0; + for (auto& fetch_name : fetch_names) { + // append fetch op + auto* 
op = block->AppendOp(); + op->SetType("fetch_v2"); + op->SetInput("X", {fetch_name}); + op->SetOutput("Out", {kFetchVarName}); + op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + i++; } } @@ -517,6 +560,12 @@ void BuildOpFuncList(const platform::Place& place, op_func_node.input_index = ins_name2id; op_func_node.output_index = outs_name2id; + const OperatorDistAttr* dist_attr = block.Op(i)->DistAttr(); + if (dist_attr && + dist_attr->execution_stream() != distributed::auto_parallel::kDefault) { + op_func_node.execution_stream_ = dist_attr->execution_stream(); + } + SingleStreamGuard single_stream_guard(ops[i]); VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope); @@ -748,38 +797,19 @@ void BuildOpFuncList(const platform::Place& place, memory::Release(place); } -void AddFetch(const std::vector& fetch_names, - framework::BlockDesc* block) { - auto* fetch_holder = block->Var(kFetchVarName); - fetch_holder->SetType(proto::VarType::FETCH_LIST); - fetch_holder->SetPersistable(true); - - int i = 0; - for (auto& fetch_name : fetch_names) { - // append fetch op - auto* op = block->AppendOp(); - op->SetType("fetch_v2"); - op->SetInput("X", {fetch_name}); - op->SetOutput("Out", {kFetchVarName}); - op->SetAttr("col", {static_cast(i)}); - op->CheckAttrs(); - i++; - } -} - -bool IsCommunicationOp(const std::string& op_name) { - const std::set special_comm_op_set = { - "send", - "recv", - "send_v2", - "recv_v2", - }; - const std::string communication_op_prefix = "c_"; - if (op_name.find(communication_op_prefix) != std::string::npos || - special_comm_op_set.count(op_name)) { - return true; +void LogDeviceMemoryStats(const platform::Place& place) { + if (FLAGS_new_executor_log_memory_stats && platform::is_gpu_place(place)) { + VLOG(0) << "memory_allocated: " + << static_cast(memory::DeviceMemoryStatCurrentValue( + "Allocated", place.device)) / + 1024 / 1024 + << " MB"; + VLOG(0) << "max_memory_allocated: " + << static_cast(memory::DeviceMemoryStatPeakValue( + "Allocated", place.device)) / + 1024 / 1024 + << " MB"; } - return false; } } // namespace interpreter diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h index 52163c64f7ea8..b842d3acfde6d 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h @@ -65,11 +65,20 @@ class AsyncWorkQueue { std::unique_ptr queue_group_; }; -void LogDeviceMemoryStats(const platform::Place& place); +bool IsCommunicationOp(const Instruction& instr); -void BuildVariableScope(const framework::BlockDesc& block, - VariableScope* var_scope, - bool use_local_scope = true); +bool IsCpuOp(const Instruction& instr); + +bool IsMemcpyD2H(const Instruction& instr); + +bool IsMemcpyH2D(const Instruction& instr); + +bool IsMemcpyOp(const Instruction& instr); + +bool IsSupportedHeterPlace(const phi::Place& place); + +void AddFetch(const std::vector& fetch_names, + framework::BlockDesc* block); void BuildOpFuncList(const platform::Place& place, const framework::BlockDesc& block, @@ -79,10 +88,11 @@ void BuildOpFuncList(const platform::Place& place, const ExecutionConfig& execution_config, bool use_local_scope = true); -void AddFetch(const std::vector& fetch_names, - framework::BlockDesc* block); +void BuildVariableScope(const framework::BlockDesc& block, + VariableScope* var_scope, + bool use_local_scope = true); -bool IsCommunicationOp(const std::string& op_name); 
+void LogDeviceMemoryStats(const platform::Place& place);

 }  // namespace interpreter
 }  // namespace framework
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h
index 6f2287a896645..6735e891230d7 100644
--- a/paddle/fluid/framework/new_executor/new_executor_defs.h
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -34,6 +34,12 @@ using OpKernelComputeFunc = std::function<void(const ExecutionContext&)>;

 constexpr int kEmptyVarIndex = 0;

+// stream types
+constexpr const char* kCustomStream = "CustomStream";
+constexpr const char* kDefaultStream = "DefaultStream";
+constexpr const char* kD2HStream = "D2HStream";
+constexpr const char* kH2DStream = "H2DStream";
+
 class InterpretercoreInferShapeContext : public InferShapeContext {
  public:
   InterpretercoreInferShapeContext(const OperatorBase& op,
@@ -274,6 +280,7 @@ class RuntimeInferShapeContext;
 struct OpFuncNode {
   // TODO(zhiqiu): Better make it unique_ptr
   std::shared_ptr<OperatorBase> operator_base_;
+  std::string execution_stream_{kDefaultStream};
   std::map<std::string, std::vector<int>> input_index;
   std::map<std::string, std::vector<int>> output_index;
   std::unordered_set<int> no_data_transform_index;
@@ -379,25 +386,6 @@ static constexpr char kMemcpyH2D[] = "memcpy_h2d";
 static constexpr char kMemcpyD2H[] = "memcpy_d2h";
 static constexpr char kFetchVarName[] = "fetch";

-static bool IsMemcpyH2D(const Instruction& instr) {
-  return instr.OpBase()->Type() == kMemcpyH2D;
-}
-
-static bool IsMemcpyD2H(const Instruction& instr) {
-  return instr.OpBase()->Type() == kMemcpyD2H;
-}
-
-static bool IsCpuOp(const Instruction& instr) {
-  return platform::is_cpu_place(instr.DeviceContext().GetPlace());
-}
-
-// is supported heterogeneous place
-static bool IsSupportedHeterPlace(const phi::Place& place) {
-  return platform::is_gpu_place(place) || platform::is_npu_place(place) ||
-         platform::is_xpu_place(place) || platform::is_ipu_place(place) ||
-         platform::is_custom_place(place);
-}
-
 // static_ref_ is the number of last live ops calculated statically after
 // `build` the Instructions.
dynamic_ref_ is the runtime version ref which will // be decreased by one dynamiclly after the execution of an op (in last ops diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index 09c54a6480516..8ee82699b4750 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -24,10 +24,6 @@ namespace paddle { namespace framework { -// stream types -constexpr const char* kD2HStream = "D2HStream"; -constexpr const char* kH2DStream = "H2DStream"; - class ContextManager { public: using DeviceContextMap = @@ -94,13 +90,14 @@ std::vector StreamAnalyzer::GetNeedEventVarIds( return false; }; - bool is_comm = interpreter::IsCommunicationOp(cur_instr.OpBase()->Type()) || - interpreter::IsCommunicationOp(next_instr.OpBase()->Type()); + bool is_memcpy = + interpreter::IsMemcpyOp(cur_instr) || interpreter::IsMemcpyOp(next_instr); + std::vector need_event_var_ids; for (auto& item : next_instr.Inputs()) { for (auto var_id : item.second) { if (unique_var_ids.count(var_id) > 0) { - if (!is_comm) { + if (is_memcpy) { if (next_instr.NoDataTransformVars().count(var_id)) { VLOG(4) << "Skip inserting event at variable " << item.first << " of operator " << next_instr.OpBase()->Type() @@ -186,12 +183,22 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext( const OpFuncNode& op_func_node) { auto& op = op_func_node.operator_base_; auto& op_type = op->Type(); + const std::string& execution_stream = op_func_node.execution_stream_; ContextManager& ctx_manager = ContextManager::Instance(); // only gpu/npu need update. xpu not need, because xpu memcpy op kernel is // synchronous. if (platform::is_gpu_place(place_) || platform::is_npu_place(place_) || platform::is_custom_place(place_)) { + VLOG(7) << "Parse DeviceContext for " << op_type + << ", execution stream = " << execution_stream; + if (execution_stream != kDefaultStream) { + return ctx_manager + .Get(std::string(kCustomStream) + "-" + execution_stream, place_) + .get() + .get(); + } + if (op_type == interpreter::kMemcpyD2H) { return ctx_manager.Get(std::string(kD2HStream), place_).get().get(); } else if (op_type == interpreter::kMemcpyH2D) { diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 321230e86064b..dcc47058b6414 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -1105,6 +1105,10 @@ void OpDesc::InferVarType(BlockDesc *block) const { } } +const OperatorDistAttr *OpDesc::DistAttr() const { + return dist_attr_ ? 
dist_attr_.get() : nullptr; +} + OperatorDistAttr *OpDesc::MutableDistAttr() { if (dist_attr_) { return dist_attr_.get(); diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 7987a9ded475c..6c6f13d7c929b 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -196,6 +196,7 @@ class OpDesc { uint64_t Id() const { return id_; } uint64_t OriginalId() const { return original_id_; } void SetOriginalId(uint64_t original_id) { original_id_ = original_id; } + const OperatorDistAttr *DistAttr() const; OperatorDistAttr *MutableDistAttr(); void SetDistAttr(const OperatorDistAttr &dist_attr); diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc index 1e0bda0c9401f..089f5da5abcda 100644 --- a/paddle/fluid/pybind/auto_parallel_py.cc +++ b/paddle/fluid/pybind/auto_parallel_py.cc @@ -215,6 +215,9 @@ void BindAutoParallel(py::module *m) { .def_property("impl_idx", &OperatorDistAttr::impl_idx, &OperatorDistAttr::set_impl_idx) + .def_property("execution_stream", + &OperatorDistAttr::execution_stream, + &OperatorDistAttr::set_execution_stream) .def_property("annotated", &OperatorDistAttr::annotated, &OperatorDistAttr::set_annotated) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a206e8994e594..d1eaebcdc2e65 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -893,7 +893,7 @@ add_subdirectory(asp) add_subdirectory(ir) -add_subdirectory(interpreter) +add_subdirectory(standalone_executor) if(WITH_TESTING) set_property(TEST test_parallel_executor_mnist diff --git a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt b/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt similarity index 100% rename from python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt rename to python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_controlflow.py similarity index 100% rename from python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py rename to python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_controlflow.py diff --git a/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_custom_stream.py b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_custom_stream.py new file mode 100644 index 0000000000000..3915b2459e083 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_custom_stream.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle +from paddle.fluid import core +from test_standalone_executor import build_program + +paddle.enable_static() + + +class TestCustomStream(unittest.TestCase): + def setUp(self): + self.steps = 3 + + ### + ### fill_constant(cpu) gaussian_random + ### | | | | + ### | | matmul_v2(s1) fill_constant + ### | | | | | + ### | | elementwise_add(s1) | + ### | | | | + ### | elementwise_sub(cpu) | + ### | | | | + ### | tanh(cpu) elementwise_add(s2) + ### | | | + ### elementwise_sub(s1) tanh(s2) + ### | | + ### elementwise_add(s2) + ### | + ### reduce_mean(s2) + ### + def set_custom_stream(self, prog): + op_index_for_stream1 = [2, 4, 9] + op_index_for_stream2 = [7, 8, 10, 11] + ops = prog.global_block().ops + for op_index in op_index_for_stream1: + ops[op_index].dist_attr.execution_stream = "s1" + for op_index in op_index_for_stream2: + ops[op_index].dist_attr.execution_stream = "s2" + + def run_program(self, apply_custom_stream=False): + paddle.seed(2022) + main_program, startup_program, fetch_list = build_program() + self.assertEqual(len(startup_program.global_block().ops), 0) + + if apply_custom_stream: + self.set_custom_stream(main_program) + + with paddle.static.program_guard(main_program, startup_program): + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + scope = core.Scope() + outs = [] + for i in range(self.steps): + outs.append( + exe.run(main_program, scope=scope, fetch_list=fetch_list) + ) + return outs + + def test_result(self): + if not core.is_compiled_with_cuda(): + return + + baselines = self.run_program() + outs = self.run_program(apply_custom_stream=True) + for bl, out in zip(baselines, outs): + self.assertEqual(bl[0], out[0]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_executor.py similarity index 100% rename from python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py rename to python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_executor.py diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_multiply_write.py similarity index 100% rename from python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py rename to python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_multiply_write.py From 5a2ab683e28359145a5f938fabb78b3f80c53a68 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Tue, 1 Nov 2022 17:12:28 +0800 Subject: [PATCH 62/91] [CodeStyle][E712] use `if cond`/`if cond is True` for comparison with `True` (#47464) * [CodeStyle][E712] use `if cond`/`if cond is True` for comparison with `True` * revert changes in fluid * revert unrelated file * revert changes in norm * revert changes in auto_parallel_amp * fix norm and auto_parallel_amp * revert a typo fix due to fixed at #47477 --- .../generator/python_c_gen.py | 2 +- .../api/full_ILSVRC2012_val_preprocess.py | 2 +- python/paddle/audio/backends/init_backend.py | 2 +- python/paddle/dataset/conll05.py | 4 +- .../auto_parallel/process_group.py | 2 +- .../auto_parallel/tuner/parallel_tuner.py | 2 +- .../distributed/fleet/dataset/dataset.py | 2 +- python/paddle/distributed/fleet/fleet.py | 2 +- python/paddle/distributed/fleet/launch.py | 2 +- .../paddle/distributed/fleet/launch_utils.py | 2 +- .../distributed/fleet/layers/mpu/mp_ops.py | 2 
+- .../gradient_merge_optimizer.py | 2 +- .../graph_execution_optimizer.py | 2 +- .../meta_optimizers/pipeline_optimizer.py | 2 +- .../meta_optimizers/raw_program_optimizer.py | 2 +- .../meta_optimizers/recompute_optimizer.py | 2 +- .../fleet/meta_optimizers/sharding/utils.py | 8 ++-- .../tensor_parallel_optimizer.py | 2 +- .../parallel_layers/pp_layers.py | 2 +- python/paddle/distributed/fleet/model.py | 4 +- python/paddle/distributed/fleet/optimizer.py | 2 +- .../distributed/fleet/recompute/recompute.py | 2 +- .../distributed/fleet/runtime/the_one_ps.py | 2 +- .../fleet/utils/hybrid_parallel_inference.py | 2 +- .../paddle/distributed/fleet/utils/ps_util.py | 2 +- .../distributed/passes/auto_parallel_amp.py | 16 ++++---- .../distributed/passes/auto_parallel_fp16.py | 19 ++++----- .../distributed/passes/ps_server_pass.py | 2 +- .../distributed/passes/ps_trainer_pass.py | 2 +- python/paddle/distributed/ps/the_one_ps.py | 12 +++--- python/paddle/distributed/ps/utils/public.py | 2 +- .../fluid/tests/book/test_recognize_digits.py | 2 +- .../unittests/auto_parallel/test_to_static.py | 4 +- .../test_autograd_functional_dynamic.py | 2 +- .../test_fleet_sharding_meta_optimizer.py | 24 +++--------- .../test_imperative_auto_mixed_precision.py | 6 +-- ...perative_auto_mixed_precision_for_eager.py | 6 +-- .../dygraph_to_static/test_cycle_gan.py | 4 +- .../dygraph_to_static/test_tensor_methods.py | 2 +- .../ir/inference/test_conv_bn_fuse_pass.py | 6 +-- .../test_emb_eltwise_layernorm_fuse_pass.py | 4 +- ...st_mkldnn_conv_affine_channel_fuse_pass.py | 6 +-- .../ir/inference/test_trt_convert_concat.py | 2 +- .../ir/inference/test_trt_convert_dropout.py | 2 +- .../ir/inference/test_trt_convert_gather.py | 4 +- .../ir/inference/test_trt_convert_gelu.py | 2 +- .../test_trt_convert_nearest_interp.py | 2 +- .../ir/inference/test_trt_convert_pool2d.py | 10 ++--- .../inference/test_trt_convert_roi_align.py | 4 +- .../test_trt_convert_shuffle_channel.py | 2 +- .../test_trt_convert_skip_layernorm.py | 2 +- .../ir/inference/test_trt_convert_stack.py | 2 +- .../ir/inference/test_trt_convert_tile.py | 2 +- .../ir/inference/test_trt_convert_top_k_v2.py | 2 +- .../inference/test_trt_convert_transpose.py | 2 +- .../ir/inference/test_trt_convert_yolo_box.py | 4 +- .../mkldnn/test_elementwise_add_mkldnn_op.py | 6 +-- .../mkldnn/test_elementwise_mul_mkldnn_op.py | 6 +-- ...st_onnx_format_quantization_mobilenetv1.py | 2 +- .../paddle/fluid/tests/unittests/op_test.py | 35 ++++++++--------- .../fluid/tests/unittests/op_test_xpu.py | 8 ++-- .../tests/unittests/ps/ps_dnn_trainer.py | 2 +- .../fluid/tests/unittests/test_adam_op.py | 4 +- .../fluid/tests/unittests/test_adamw_op.py | 4 +- .../test_async_ssa_graph_executor_mnist.py | 4 +- .../tests/unittests/test_batch_norm_op.py | 2 +- .../tests/unittests/test_batch_norm_op_v2.py | 2 +- .../tests/unittests/test_box_coder_op.py | 8 ++-- .../fluid/tests/unittests/test_center_loss.py | 2 +- .../tests/unittests/test_compare_reduce_op.py | 2 +- .../fluid/tests/unittests/test_conv2d_op.py | 25 ++++++------ .../unittests/test_conv2d_transpose_op.py | 8 ++-- .../fluid/tests/unittests/test_conv3d_op.py | 8 ++-- .../tests/unittests/test_dataset_download.py | 2 +- .../fluid/tests/unittests/test_dist_base.py | 12 +++--- .../unittests/test_elementwise_add_op.py | 12 +++--- .../unittests/test_elementwise_mul_op.py | 10 ++--- .../tests/unittests/test_empty_like_op.py | 4 +- .../fluid/tests/unittests/test_empty_op.py | 12 +++--- .../test_imperative_layer_trainable.py | 8 ++-- 
.../fluid/tests/unittests/test_mean_op.py | 4 +- .../fluid/tests/unittests/test_momentum_op.py | 4 +- .../tests/unittests/test_multiclass_nms_op.py | 12 ++---- .../fluid/tests/unittests/test_ops_nms.py | 2 +- .../fluid/tests/unittests/test_optimizer.py | 2 +- .../test_parallel_executor_drop_scope.py | 4 +- .../fluid/tests/unittests/test_pool2d_op.py | 18 ++++----- .../fluid/tests/unittests/test_pool3d_op.py | 2 +- .../fluid/tests/unittests/test_sgd_op.py | 8 ++-- .../fluid/tests/unittests/test_softmax_op.py | 14 +++---- .../test_softmax_with_cross_entropy_op.py | 2 +- .../unittests/test_sparse_attention_op.py | 10 ++--- .../fluid/tests/unittests/test_var_base.py | 8 ++-- .../fluid/tests/unittests/test_where_op.py | 4 +- .../unittests/xpu/test_batch_norm_op_xpu.py | 2 +- .../tests/unittests/xpu/test_conv2d_op_xpu.py | 30 +++----------- .../unittests/xpu/test_dropout_op_xpu.py | 4 +- .../xpu/test_elementwise_mul_op_xpu.py | 6 +-- .../tests/unittests/xpu/test_empty_op_xpu.py | 4 +- .../xpu/test_fused_gemm_epilogue_op_xpu.py | 4 +- .../tests/unittests/xpu/test_matmul_op_xpu.py | 8 ++-- .../unittests/xpu/test_matmul_v2_op_xpu.py | 2 +- .../tests/unittests/xpu/test_pool2d_op_xpu.py | 2 +- .../tests/unittests/xpu/test_where_op_xpu.py | 4 +- python/paddle/hapi/model_summary.py | 2 +- python/paddle/incubate/autograd/primrules.py | 4 +- python/paddle/nn/functional/common.py | 4 +- python/paddle/nn/functional/loss.py | 20 +++++----- python/paddle/nn/functional/pooling.py | 2 +- python/paddle/nn/layer/distance.py | 2 +- python/paddle/nn/layer/norm.py | 17 ++++---- python/paddle/nn/layer/rnn.py | 6 +-- python/paddle/nn/quant/quant_layers.py | 2 +- python/paddle/profiler/profiler_statistic.py | 4 +- python/paddle/profiler/utils.py | 4 +- python/paddle/sparse/nn/layer/norm.py | 5 ++- python/paddle/tensor/linalg.py | 10 ++--- python/paddle/tensor/manipulation.py | 8 ++-- python/paddle/tensor/random.py | 2 +- python/paddle/text/datasets/conll05.py | 4 +- tools/analysisPyXml.py | 39 +++++++++---------- tools/check_op_benchmark_result.py | 2 +- tools/get_pr_ut.py | 4 +- tools/get_single_test_cov.py | 2 +- ...rate_pd_op_dialect_from_paddle_op_maker.py | 32 +++++++-------- tools/sampcd_processor.py | 4 +- 126 files changed, 347 insertions(+), 401 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index dc38d3d46b293..0ceff35360868 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -532,7 +532,7 @@ def GeneratePythonCFunctions(self): ) status = f_generator.run() - if status == True: + if status: self.python_c_functions_str += ( f_generator.python_c_function_str + "\n" ) diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py index 219ce72077cd5..a5ff3717199b0 100644 --- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py +++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py @@ -45,7 +45,7 @@ def resize_short(img, target_size): def crop_image(img, target_size, center): width, height = img.size size = target_size - if center == True: + if center: w_start = (width - size) // 2 h_start = (height - size) // 2 else: diff --git a/python/paddle/audio/backends/init_backend.py b/python/paddle/audio/backends/init_backend.py index 3ca77ba316f98..6bf972d435f88 100644 --- 
a/python/paddle/audio/backends/init_backend.py +++ b/python/paddle/audio/backends/init_backend.py @@ -79,7 +79,7 @@ def list_available_backends() -> List[str]: if "paddleaudio" in sys.modules: version = paddleaudio.__version__ - if _check_version(version) == False: + if not _check_version(version): err_msg = ( "the version of paddleaudio installed is {},\n" "please ensure the paddleaudio >= 1.0.2." diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py index 22038594f60c9..08a383badae5d 100644 --- a/python/paddle/dataset/conll05.py +++ b/python/paddle/dataset/conll05.py @@ -109,9 +109,9 @@ def reader(): lbl_seq = [] verb_word = '' for l in lbl: - if l == '*' and is_in_bracket == False: + if l == '*' and not is_in_bracket: lbl_seq.append('O') - elif l == '*' and is_in_bracket == True: + elif l == '*' and is_in_bracket: lbl_seq.append('I-' + cur_tag) elif l == '*)': lbl_seq.append('I-' + cur_tag) diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 9883f116f4eea..10d2556f299ce 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -106,7 +106,7 @@ def add_ranks(self, new_ranks): return else: assert ( - self.is_instantiate() == False + not self.is_instantiate() ), "Cannot add new ranks after instantiating the process group" self._ranks.extend(new_ranks) self._ranks = sorted(list(set(self.ranks))) diff --git a/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py b/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py index 0ed03defbed86..9f31766f19f2f 100644 --- a/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py +++ b/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py @@ -268,7 +268,7 @@ def _generate_dims_mapping_candidates_helper( return for idx, dim in enumerate(dims_list): - if visited[idx] == False: + if not visited[idx]: dims_mapping[start] = dim visited[idx] = True self._generate_dims_mapping_candidates_helper( diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index b74f700391cf5..0217e012579cd 100755 --- a/python/paddle/distributed/fleet/dataset/dataset.py +++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -514,7 +514,7 @@ def update_settings(self, **kwargs): self._set_fleet_send_batch_size(kwargs[key]) elif key == "fleet_send_sleep_seconds": self._set_fleet_send_sleep_seconds(kwargs[key]) - elif key == "fea_eval" and kwargs[key] == True: + elif key == "fea_eval" and kwargs[key]: candidate_size = kwargs.get("candidate_size", 10000) self._set_fea_eval(candidate_size, True) diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index db3aae28a2018..2630fa8283eed 100644 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -303,7 +303,7 @@ def init( paddle.distributed.init_parallel_env() # hybrid parallel not support for npu/xpu - if self._user_defined_strategy.heter_ccl_mode == False: + if not self._user_defined_strategy.heter_ccl_mode: # init hybrid parallel environment in dygraph if tp._HYBRID_PARALLEL_GROUP is None: self._init_hybrid_parallel_env() diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index ca301c52a7f25..998f64c3ec293 100755 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -369,7 
+369,7 @@ def get_cluster_info(args): if os.environ.get('FLAGS_START_PORT') is not None: start_port = os.environ.get('FLAGS_START_PORT') # auto mapping between processes and devices for auto-parallel - if args.enable_auto_mapping == True: + if args.enable_auto_mapping: assert ( args.cluster_topo_path is not None ), "The cluster topology must be provied when enabling auto mapping." diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 4ec2aa07787cb..d4b6b86119fa3 100755 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -1582,7 +1582,7 @@ def get_role_endpoints(self, args): x.strip().split(":")[0] for x in self.worker_endpoints.split(",") ] - if self.with_coordinator == True: + if self.with_coordinator: self.coordinator_endpoints_ips = [ x.strip().split(":")[0] for x in self.coordinator_endpoints.split(",") diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py index b1627d5a3b79c..04c4272ee0eec 100644 --- a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py +++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py @@ -582,7 +582,7 @@ def _parallel_linear( # set is_distributed for splited bias # if a linear layer is splited by row, each rank would hold a complete bias and they should be the same in each rank. # if a linear layer is splited by col, the bias would also be split into each rank as its weight - if axis == 1 and linear._bias_attr != False: + if axis == 1 and linear._bias_attr is not False: _set_var_distributed(linear.bias) if not gather_out: diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py index eb97122587f36..9a2fb12799985 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py @@ -53,7 +53,7 @@ def _can_apply(self): return False can_apply = ( - self.user_defined_strategy.gradient_merge == True + self.user_defined_strategy.gradient_merge ) and self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1 return can_apply diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index dd2ccfc7ff7d2..a1a33992d5946 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ -177,7 +177,7 @@ def _try_to_compile(self, startup_program, main_program, loss): gradient_scale_configs['scale_strategy'] ] - if self.user_defined_strategy.recompute == True: + if self.user_defined_strategy.recompute: logging.warn( "set enable_sequential_execution=True since you have enable the recompute strategy" ) diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index dfb8fe5b22438..dfadeff3807e2 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -66,7 +66,7 @@ def _can_apply(self): if self.use_sharding: return False - if self.user_defined_strategy.pipeline == True: + if self.user_defined_strategy.pipeline: return True return False diff --git 
a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py index 6ae89a9754ebd..53972452d80fa 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py @@ -65,7 +65,7 @@ def _can_apply(self): if not self.role_maker._is_collective: return False - if self.without_graph_optimization == True: + if self.without_graph_optimization: return True return False diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py index 524c3a123abc0..7a817b6fd04e0 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py @@ -55,7 +55,7 @@ def _can_apply(self): if not self.role_maker._is_collective: return False - if self.user_defined_strategy.recompute == True: + if self.user_defined_strategy.recompute: if ( len(self.user_defined_strategy.recompute_configs["checkpoints"]) == 0 diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 1ec9457854dce..ea42130300f11 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -38,7 +38,7 @@ def check_broadcast(block): broadcast_vars = {} for idx, op in enumerate(block.ops): if op.type == "c_broadcast": - if op.all_attrs()["use_calc_stream"] == False: + if not op.all_attrs()["use_calc_stream"]: var_name = op.desc.input_arg_names()[0] if "@BroadCast" in var_name: if var_name in broadcast_vars: @@ -72,7 +72,7 @@ def check_broadcast(block): last_sync_calc_op_idx = idx continue if op.type == "c_broadcast": - if op.all_attrs()["use_calc_stream"] == False: + if not op.all_attrs()["use_calc_stream"]: var_name = op.desc.input_arg_names()[0] if "@BroadCast" in var_name: if broadcast_vars[var_name]["fill_constant_pos"] != -1: @@ -117,7 +117,7 @@ def check_allreduce_sum(block, shard, sharding_ring_id, dp_ring_id=-1): for idx, op in enumerate(block.ops): # sharding use both allreduce and reduce to sync grad if op.type == "c_allreduce_sum" or op.type == "c_reduce_sum": - if op.all_attrs()["use_calc_stream"] == False: + if not op.all_attrs()["use_calc_stream"]: ring_id = op.desc.attr("ring_id") var_name = op.desc.input_arg_names()[0] param = var_name.split("@")[0] @@ -153,7 +153,7 @@ def check_allreduce_sum(block, shard, sharding_ring_id, dp_ring_id=-1): dp_grads_status[var_name] = 1 # check sharding allreduce and reduce but skip megatron allreduce elif op.type == "c_allreduce_sum" or op.type == "c_reduce_sum": - if op.all_attrs()["use_calc_stream"] == False: + if not op.all_attrs()["use_calc_stream"]: var_name = op.desc.input_arg_names()[0] ring_id = op.desc.attr("ring_id") if ring_id == sharding_ring_id: diff --git a/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py index 0cd86ad08bde2..8f2e113b52e17 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py @@ -57,7 +57,7 @@ def _can_apply(self): if not self.role_maker._is_collective: return False - if 
self.user_defined_strategy.tensor_parallel == True: + if self.user_defined_strategy.tensor_parallel: return True return False diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index fec5005627b88..29cbe0d9dcac2 100755 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -720,7 +720,7 @@ def forward(self, input, chunk_id=None): def _need_recompute(self, funcs, inputs): if not any( - input_.stop_gradient == False + not input_.stop_gradient for input_ in inputs if isinstance(input_, paddle.Tensor) ): diff --git a/python/paddle/distributed/fleet/model.py b/python/paddle/distributed/fleet/model.py index 21e6d07ad55e3..a132860aac162 100644 --- a/python/paddle/distributed/fleet/model.py +++ b/python/paddle/distributed/fleet/model.py @@ -90,7 +90,7 @@ def forward(self, x): amp_enable = False strategy = fleet_env._user_defined_strategy - if strategy.amp == True: + if strategy.amp: amp_enable = True amp_level = "O2" if strategy.amp_configs['use_pure_fp16'] else "O1" if amp_level.upper() == "O2": @@ -122,7 +122,7 @@ def forward(self, x): use_dynamic_loss_scaling=use_dynamic_loss_scaling, ) - if strategy.heter_ccl_mode == True: + if strategy.heter_ccl_mode: distributed_model = paddle.DataParallel( model, comm_buffer_size=strategy.fuse_grad_size_in_MB, diff --git a/python/paddle/distributed/fleet/optimizer.py b/python/paddle/distributed/fleet/optimizer.py index 37a3a896f6b05..f67c108486a9b 100644 --- a/python/paddle/distributed/fleet/optimizer.py +++ b/python/paddle/distributed/fleet/optimizer.py @@ -59,7 +59,7 @@ def _dygraph_distributed_optimizer(optimizer, strategy=None): fleet_env._context = {} if fleet_env.worker_num() > 1: - if fleet_env._user_defined_strategy.heter_ccl_mode == False: + if not fleet_env._user_defined_strategy.heter_ccl_mode: return HybridParallelOptimizer( optimizer, fleet_env._hcg, fleet_env._user_defined_strategy ) diff --git a/python/paddle/distributed/fleet/recompute/recompute.py b/python/paddle/distributed/fleet/recompute/recompute.py index 2657c60f02c9a..fd8cffdff00a8 100755 --- a/python/paddle/distributed/fleet/recompute/recompute.py +++ b/python/paddle/distributed/fleet/recompute/recompute.py @@ -41,7 +41,7 @@ def detach_variable(inputs): def check_recompute_necessary(inputs): if not any( - input_.stop_gradient == False + not input_.stop_gradient for input_ in inputs if isinstance(input_, (core.eager.Tensor, paddle.Tensor)) ): diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index 5a0be9a1e018f..7de34aa6e1c85 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -337,7 +337,7 @@ def parse_by_optimizer( self.table_num = size self.table_dim = single_dim - if oop.type != 'adam' and adam_d2sum == True: + if oop.type != 'adam' and adam_d2sum: print('optimization algorithm is not adam, set adam_d2sum False') adam_d2sum = False print("adam_d2sum:", adam_d2sum) diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py index 0c5bff02ed820..6dd100a6f9e70 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py @@ -231,7 +231,7 
@@ def __init__( ) else: if isinstance(role_maker, fleet.base.role_maker.RoleMakerBase): - assert role_maker._is_collective == True + assert role_maker._is_collective self.role_maker = role_maker # communication_group info diff --git a/python/paddle/distributed/fleet/utils/ps_util.py b/python/paddle/distributed/fleet/utils/ps_util.py index 9b079d64bb530..d283dbe1fe8b6 100644 --- a/python/paddle/distributed/fleet/utils/ps_util.py +++ b/python/paddle/distributed/fleet/utils/ps_util.py @@ -210,7 +210,7 @@ def dag_check_up_and_reorder(program, inputs, outputs): if found: break if found: - if output_indexes[j] == True: + if output_indexes[j]: warnings.warn( "unable to re-arrange dags order to combine distributed embedding ops" ) diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py index 3305982f5055c..c8932069a794e 100644 --- a/python/paddle/distributed/passes/auto_parallel_amp.py +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -80,9 +80,9 @@ def _build_state(self, amp_lists, dist_context): fwd_op_id = dist_op_context.grad_op_id_to_op_id[ op.desc.original_id() ] - if self._is_fp16_op(fwd_op_id) == True: + if self._is_fp16_op(fwd_op_id) is True: self._op_fp16_dict[op.desc.original_id()] = True - elif self._is_fp16_op(fwd_op_id) == False: + elif self._is_fp16_op(fwd_op_id) is False: self._op_fp16_dict[op.desc.original_id()] = False elif int(op.attr('op_role')) == int(OpRole.Optimize): break @@ -132,13 +132,13 @@ def _mark_black_white_ops(self, amp_lists): # if it's one of inputs if ( self._is_fp16_op(prev_op.desc.original_id()) - == False + is False or prev_op.type in amp_lists.black_list ): is_black_op = True elif ( self._is_fp16_op(prev_op.desc.original_id()) - == True + is True or prev_op.type in amp_lists.white_list ): is_white_op = True @@ -161,7 +161,7 @@ def cast_forward_program(self, dist_context): num_cast_ops = 0 if int(op.attr('op_role')) == int(OpRole.Backward): break - if self._is_fp16_op(op.desc.original_id()) == False: + if self._is_fp16_op(op.desc.original_id()) is False: num_cast_ops = self._insert_cast_op_forward( op, idx, @@ -169,7 +169,7 @@ def cast_forward_program(self, dist_context): core.VarDesc.VarType.FP32, dist_context, ) - elif self._is_fp16_op(op.desc.original_id()) == True: + elif self._is_fp16_op(op.desc.original_id()) is True: num_cast_ops = self._insert_cast_op_forward( op, idx, @@ -302,7 +302,7 @@ def cast_backward_program(self, params_grads, dist_context): grad_op_orig_id = grad_op.desc.original_id() dist_op_context = dist_context.dist_op_context if grad_op_orig_id in dist_op_context.grad_op_id_to_op_id: - if self._is_fp16_op(grad_op_orig_id) == False: # fp32 + if self._is_fp16_op(grad_op_orig_id) is False: # fp32 num_cast_ops = self._insert_cast_op_backward( grad_op, idx, @@ -311,7 +311,7 @@ def cast_backward_program(self, params_grads, dist_context): dist_context, appended_grad_times, ) - elif self._is_fp16_op(grad_op_orig_id) == True: # fp16 + elif self._is_fp16_op(grad_op_orig_id) is True: # fp16 num_cast_ops = self._insert_cast_op_backward( grad_op, idx, diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index cf1b2d45290bd..8ad8b2a8fad41 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -235,10 +235,7 @@ def resolute_tensor_dtype(self, block): for op in block.ops: if is_forward_op(op): # NOTE (JZ-LIANG) un-expected cast 
op when user call "+, -, *, /" in python - if ( - self._is_fp16_op(op.desc.original_id()) == True - or op.type == "cast" - ): + if self._is_fp16_op(op.desc.original_id()) or op.type == "cast": for in_name in op.input_names: if _keep_fp32_input(op, in_name): continue @@ -255,7 +252,7 @@ def resolute_tensor_dtype(self, block): self.set_var_to_fp16(out_var_name, block) set_op_dtype_to_fp16(op) # NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python - elif self._is_fp16_op(op.desc.original_id()) == False: + elif not self._is_fp16_op(op.desc.original_id()): for out_var_name in op.output_arg_names: out_var = block.vars.get(out_var_name) if out_var is None or out_var.type not in _valid_types: @@ -263,7 +260,7 @@ def resolute_tensor_dtype(self, block): if out_var.dtype == core.VarDesc.VarType.FP16: out_var.desc.set_dtype(core.VarDesc.VarType.FP32) elif is_backward_op(op): - if self._is_fp16_op(op.desc.original_id()) == True: + if self._is_fp16_op(op.desc.original_id()): for out_name in op.output_names: if _keep_fp32_output(op, out_name): continue @@ -271,7 +268,7 @@ def resolute_tensor_dtype(self, block): self.set_var_to_fp16(out_var_name, block) set_op_dtype_to_fp16(op) # NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python - elif self._is_fp16_op(op.desc.original_id()) == False: + elif not self._is_fp16_op(op.desc.original_id()): for out_var_name in op.output_arg_names: out_var = block.vars.get(out_var_name) if out_var is None or out_var.type not in _valid_types: @@ -290,7 +287,7 @@ def cast_block(self, block): idx += 1 continue elif is_forward_op(op): - if self._is_fp16_op(op.desc.original_id()) == False: + if not self._is_fp16_op(op.desc.original_id()): num_cast_ops = self._insert_forward_cast_ops( op, idx, @@ -299,7 +296,7 @@ def cast_block(self, block): core.VarDesc.VarType.FP32, self.dist_context, ) - elif self._is_fp16_op(op.desc.original_id()) == True: + elif self._is_fp16_op(op.desc.original_id()): num_cast_ops = self._insert_forward_cast_ops( op, idx, @@ -310,7 +307,7 @@ def cast_block(self, block): ) elif is_backward_op(op): if op.desc.original_id() in dist_op_context.grad_op_id_to_op_id: - if self._is_fp16_op(op.desc.original_id()) == False: + if not self._is_fp16_op(op.desc.original_id()): num_cast_ops = self._insert_backward_cast_ops( op, idx, @@ -319,7 +316,7 @@ def cast_block(self, block): core.VarDesc.VarType.FP32, self.dist_context, ) - elif self._is_fp16_op(op.desc.original_id()) == True: + elif self._is_fp16_op(op.desc.original_id()): num_cast_ops = self._insert_backward_cast_ops( op, idx, diff --git a/python/paddle/distributed/passes/ps_server_pass.py b/python/paddle/distributed/passes/ps_server_pass.py index 37e5622ea8e7f..c8f99895a8324 100755 --- a/python/paddle/distributed/passes/ps_server_pass.py +++ b/python/paddle/distributed/passes/ps_server_pass.py @@ -140,7 +140,7 @@ def _get_lr_sheduler_program(self, lr_sheduler, lr_decay_steps): def _apply_single_impl(self, main_program, startup_program, pass_ctx): attrs = pass_ctx._attrs - if hasattr(attrs['origin_main_program'], 'lr_sheduler') == False: + if not hasattr(attrs['origin_main_program'], 'lr_sheduler'): return assert isinstance( diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index f99d9f316d462..56f73078d54db 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -304,7 +304,7 @@ def dag_check_up_and_reorder(program, inputs, outputs): if 
found: break if found: - if output_indexes[j] == True: + if output_indexes[j]: warnings.warn( "unable to re-arrange dags order to combine distributed embedding ops" ) diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index 86766d60ae8c0..d341a95b24be7 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -443,7 +443,7 @@ def parse_by_optimizer(self, ctx, context): self.table_num = size self.table_dim = single_dim - if oop.type != 'adam' and adam_d2sum == True: + if oop.type != 'adam' and adam_d2sum: print('optimization algorithm is not adam, set adam_d2sum False') adam_d2sum = False print("adam_d2sum:", adam_d2sum) @@ -703,7 +703,7 @@ def _set(self, table_proto): if ( ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 - or (ctx.is_sparse() == False) + or (not ctx.is_sparse()) ): return table_proto.table_id = ctx.table_id() @@ -810,7 +810,7 @@ def _set(self, table_proto): if ( ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 - or (ctx.is_sparse() == False) + or (not ctx.is_sparse()) ): return table_proto.table_id = ctx.table_id() @@ -845,7 +845,7 @@ def _set(self, table_proto): if ( ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 - or (ctx.is_sparse() == True) + or (ctx.is_sparse()) ): return @@ -1281,7 +1281,7 @@ def sync_strategy_envs(): if not is_test: if ( self.context['ps_mode'] == DistributedMode.GEO - or self.is_heter_ps_mode == True + or self.is_heter_ps_mode ): self._communicator.init_params(dense_map) else: @@ -1298,7 +1298,7 @@ def sync_strategy_envs(): if ( self.context['ps_mode'] == DistributedMode.GEO - or self.is_heter_ps_mode == True + or self.is_heter_ps_mode ): if not self._communicator.is_running(): self._communicator.start() diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index 75182a497213c..53628ad7e5084 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -1744,7 +1744,7 @@ def create_backward_block( ): is_skip = True break - if is_skip == True: + if is_skip: continue block_append_op(program, origin_program, heter_block, op) diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index fd99eb04d99fc..9204bc7f1235d 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -237,7 +237,7 @@ def main(use_cuda, parallel, nn_type, combine): if not use_cuda and not parallel: save_dirname = "recognize_digits_" + nn_type + ".inference.model" save_full_dirname = "recognize_digits_" + nn_type + ".train.model" - if combine == True: + if combine: model_filename = "__model_combined__" params_filename = "__params_combined__" diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py index 3c8b71e71394e..637f1ba844be4 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py @@ -144,7 +144,7 @@ def test_to_static(self): # inputs = InputSpec([batch_size, hidden_size], 'float32', 'x') # labels = InputSpec([batch_size], 'int64', 'label') - assert _non_static_mode() == True + assert _non_static_mode() engine = auto.Engine( model=mlp, loss=loss, @@ -155,7 +155,7 @@ def test_to_static(self): engine.fit(dataset, 
batch_size=batch_size) engine.evaluate(dataset, batch_size=batch_size) engine.predict(dataset, batch_size=batch_size) - assert _non_static_mode() == False + assert not _non_static_mode() class TestLazyInit(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py index 81f1bdede6f41..37af28458078e 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py @@ -593,7 +593,7 @@ def func(x): numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False hessian = paddle.incubate.autograd.Hessian(func, self.x) - assert hessian[:].stop_gradient == False + assert not hessian[:].stop_gradient np.testing.assert_allclose( hessian[:].numpy(), numerical_hessian, self.rtol, self.atol ) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py index 57a199c133395..bc725e9d13801 100755 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py @@ -116,9 +116,7 @@ def test_sharding_amp_optimizer(self): self.optimizer(avg_cost, strategy, train_prog, startup_prog) ops = [op.type for op in avg_cost.block.ops] vars = [x.name for x in train_prog.list_vars()] - parameters = [ - x.name for x in train_prog.list_vars() if x.persistable == True - ] + parameters = [x.name for x in train_prog.list_vars() if x.persistable] self.assertIn('@BroadCast', ''.join(vars)) self.assertIn('cast', ops) self.assertIn('check_finite_and_unscale', ops) @@ -227,9 +225,7 @@ def test_sharding_recompute_optimizer(self): ops = [op.type for op in avg_cost.block.ops] vars = [x.name for x in train_prog.list_vars()] - parameters = [ - x.name for x in train_prog.list_vars() if x.persistable == True - ] + parameters = [x.name for x in train_prog.list_vars() if x.persistable] self.assertIn('@BroadCast', ''.join(vars)) self.assertIn('subprog', ''.join(vars)) @@ -316,9 +312,7 @@ def test_sharding_amp_recompute_optimizer(self): ops = [op.type for op in avg_cost.block.ops] vars = [x.name for x in train_prog.list_vars()] - parameters = [ - x.name for x in train_prog.list_vars() if x.persistable == True - ] + parameters = [x.name for x in train_prog.list_vars() if x.persistable] self.assertIn('@BroadCast', ''.join(vars)) self.assertIn('subprog', ''.join(vars)) @@ -445,9 +439,7 @@ def test_sharding_amp_asp_optimizer(self): ops = [op.type for op in avg_cost.block.ops] vars = [x.name for x in train_prog.list_vars()] - parameters = [ - x.name for x in train_prog.list_vars() if x.persistable == True - ] + parameters = [x.name for x in train_prog.list_vars() if x.persistable] self.assertIn('@BroadCast', ''.join(vars)) self.assertIn('cast', ops) @@ -564,9 +556,7 @@ def test_sharding_weight_decay(self): startup_prog, regularization=regularization, ) - parameters = [ - x.name for x in train_prog.list_vars() if x.persistable == True - ] + parameters = [x.name for x in train_prog.list_vars() if x.persistable] ops = [op.type for op in avg_cost.block.ops] vars = [x.name for x in train_prog.list_vars()] self.assertIn('@BroadCast', ''.join(vars)) @@ -653,9 +643,7 @@ def 
test_sharding_gradient_clip(self): self.optimizer( avg_cost, strategy, train_prog, startup_prog, grad_clip=clip ) - parameters = [ - x.name for x in train_prog.list_vars() if x.persistable == True - ] + parameters = [x.name for x in train_prog.list_vars() if x.persistable] ops = [op.type for op in avg_cost.block.ops] vars = [x.name for x in train_prog.list_vars()] self.assertIn('@BroadCast', ''.join(vars)) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py index 916a21359a4f8..514009577cd11 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py @@ -420,13 +420,13 @@ def test_get_and_set(self): decr_every_n_nan_or_inf=2, use_dynamic_loss_scaling=True, ) - self.assertEqual(scaler.is_enable() == True, True) + self.assertEqual(scaler.is_enable(), True) self.assertEqual(scaler.get_init_loss_scaling() == 1024, True) self.assertEqual(scaler.get_incr_ratio() == 2.0, True) self.assertEqual(scaler.get_decr_ratio() == 0.5, True) self.assertEqual(scaler.get_incr_every_n_steps() == 1000, True) self.assertEqual(scaler.get_decr_every_n_nan_or_inf() == 2, True) - self.assertEqual(scaler.is_use_dynamic_loss_scaling() == True, True) + self.assertEqual(scaler.is_use_dynamic_loss_scaling(), True) scaler.set_decr_every_n_nan_or_inf(4) self.assertEqual(scaler.get_decr_every_n_nan_or_inf() == 4, True) scaler.set_decr_ratio(0.1) @@ -460,7 +460,7 @@ def test_state_dict_and_load_state_dict(self): scaler3 = paddle.amp.GradScaler(enable=False) scaler3.load_state_dict(scaler_state) - self.assertEqual(scaler3.is_enable() == False, True) + self.assertFalse(scaler3.is_enable()) def test_state_dict_and_load_state_dict_error(self): def test_error(): diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py index 83c9462a89e9d..1eec439f792d9 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py @@ -419,13 +419,13 @@ def test_get_and_set(self): decr_every_n_nan_or_inf=2, use_dynamic_loss_scaling=True, ) - self.assertEqual(scaler.is_enable() == True, True) + self.assertEqual(scaler.is_enable(), True) self.assertEqual(scaler.get_init_loss_scaling() == 1024, True) self.assertEqual(scaler.get_incr_ratio() == 2.0, True) self.assertEqual(scaler.get_decr_ratio() == 0.5, True) self.assertEqual(scaler.get_incr_every_n_steps() == 1000, True) self.assertEqual(scaler.get_decr_every_n_nan_or_inf() == 2, True) - self.assertEqual(scaler.is_use_dynamic_loss_scaling() == True, True) + self.assertEqual(scaler.is_use_dynamic_loss_scaling(), True) scaler.set_decr_every_n_nan_or_inf(4) self.assertEqual(scaler.get_decr_every_n_nan_or_inf() == 4, True) scaler.set_decr_ratio(0.1) @@ -459,7 +459,7 @@ def test_state_dict_and_load_state_dict(self): scaler3 = paddle.amp.GradScaler(enable=False) scaler3.load_state_dict(scaler_state) - self.assertEqual(scaler3.is_enable() == False, True) + self.assertFalse(scaler3.is_enable()) def test_state_dict_and_load_state_dict_error(self): def test_error(): diff --git 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py index 03db89350795c..b484a88b7df31 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py @@ -356,7 +356,7 @@ def __init__( ): super(conv2d, self).__init__() - if use_bias == False: + if not use_bias: con_bias_attr = False else: con_bias_attr = fluid.ParamAttr( @@ -426,7 +426,7 @@ def __init__( ): super(DeConv2D, self).__init__() - if use_bias == False: + if not use_bias: de_bias_attr = False else: de_bias_attr = fluid.ParamAttr( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_methods.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_methods.py index ecda3427e7ea2..67ea0a28bc0dd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_methods.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_methods.py @@ -93,7 +93,7 @@ def _run(self, to_static): prog_trans = paddle.jit.ProgramTranslator() prog_trans.enable(to_static) x = paddle.ones([1, 2, 3]) - if to_static == False: + if not to_static: return tensor_size(x) return tensor_size(x).numpy() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py index e16fd8b10c2f8..09171d64a28f3 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py @@ -134,7 +134,7 @@ def generate_bn_Var(): data_layout=data_format, is_test=True, ) - if has_bias == True: + if has_bias: conv2d_op.inputs["Bias"] = ["conv2d_bias"] ops = [conv2d_op, bn_op] @@ -156,7 +156,7 @@ def generate_bn_Var(): }, outputs=["batch_norm_Y"], ) - if has_bias == True: + if has_bias: program_config.weights["conv2d_bias"] = TensorConfig( data_gen=partial(generate_conv2d_Bias) ) @@ -202,7 +202,7 @@ def teller1(program_config, predictor_config): def teller2(program_config, predictor_config): return ( predictor_config.mkldnn_enabled() - and program_config.ops[0].attrs['has_bias'] == True + and program_config.ops[0].attrs['has_bias'] ) self.add_ignore_check_case( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_emb_eltwise_layernorm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_emb_eltwise_layernorm_fuse_pass.py index 3cbd48dea6d4e..883b4a75bc224 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_emb_eltwise_layernorm_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_emb_eltwise_layernorm_fuse_pass.py @@ -43,11 +43,11 @@ class TestEmbeddingEltwiseLayerNormFusePass(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: # is_sparse is only support False - if program_config.ops[0].attrs['is_sparse'] == True: + if program_config.ops[0].attrs['is_sparse']: return False # is_distributed only support False - if program_config.ops[0].attrs['is_distributed'] == True: + if program_config.ops[0].attrs['is_distributed']: return False # axis only support -1 and the last dim. 
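NOTE: the conversions in this series are mechanical: `x == True` becomes `x`,
`x == False` becomes `not x`, and comparisons that feed arithmetic keep their
numeric value, since Python bools are ints — `(norm == False)` and `(not norm)`
both evaluate to 0 or 1 in the box_coder test below. The one pattern
deliberately left as an identity check is the tri-state flag: a helper that can
return True, False, or None (as `_is_fp16_op` does in the auto_parallel_amp
pass above) must keep `is True` / `is False`, because `not x` would conflate
False with None. A minimal sketch of that distinction, using an illustrative
stand-in helper (`fp16_state` is not a name from this patch):

    from typing import Optional

    def fp16_state(op_id: int, table: dict) -> Optional[bool]:
        # Stand-in for the tri-state lookup, illustrative only.
        # True: cast op to fp16; False: keep fp32; None: undecided.
        return table.get(op_id)

    decisions = {1: True, 2: False}
    for op_id in (1, 2, 3):
        if fp16_state(op_id, decisions) is True:     # exactly True
            print(op_id, "cast to fp16")
        elif fp16_state(op_id, decisions) is False:  # exactly False, not None
            print(op_id, "cast to fp32")
        else:                                        # None: leave as-is
            print(op_id, "no decision yet")

For the same reason `linear._bias_attr != False` is rewritten above to
`linear._bias_attr is not False` rather than to a bare truthiness test: a
bias_attr of None (use the default bias) must not be treated like an explicit
False (no bias at all).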
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py index ba6179a1ff41f..92881bd8d8200 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py @@ -100,7 +100,7 @@ def generate_scale_bias(): outputs={"Out": ["affine_channel_ouput"]}, data_layout=data_format, ) - if has_bias == True: + if has_bias: conv2d_op.inputs["Bias"] = ["conv2d_bias"] ops = [conv2d_op, ac_op] @@ -123,7 +123,7 @@ def generate_scale_bias(): }, outputs=["affine_channel_ouput"], ) - if has_bias == True: + if has_bias: program_config.weights["conv2d_bias"] = TensorConfig( data_gen=partial(generate_bias) ) @@ -145,7 +145,7 @@ def teller1(program_config, predictor_config): def teller2(program_config, predictor_config): return ( predictor_config.mkldnn_enabled() - and program_config.ops[0].attrs['has_bias'] == True + and program_config.ops[0].attrs['has_bias'] ) self.add_ignore_check_case( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py index 46a7b82ef4d78..6c3c47687751a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py @@ -304,7 +304,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if dynamic_shape == True: + if dynamic_shape: return 1, 4 else: if attrs[0]['axis'] != 0: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py index 91b8380d7d612..ee8f900f5126b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py @@ -123,7 +123,7 @@ def clear_dynamic_shape(): def generate_trt_nodes_num(attrs, dynamic_shape): if attrs[0]['dropout_implementation'] == "upscale_in_train": return 0, 2 - elif self.dims == 1 and dynamic_shape == False: + elif self.dims == 1 and not dynamic_shape: return 0, 3 else: return 1, 2 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py index ca2984fa18777..fab7428579a55 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py @@ -85,7 +85,7 @@ def generate_input3(axis): "index_data": TensorConfig( data_gen=partial( generate_input2 - if index_type_int32 == True + if index_type_int32 else generate_input4, index, ) @@ -180,7 +180,7 @@ def generate_trt_nodes_num(dynamic_shape): if self.input_num == 3: return 0, 5 else: - if dynamic_shape and self.index_type_int32 == True: + if dynamic_shape and self.index_type_int32: return 1, 3 else: return 0, 4 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py index d6d2f876361e5..29962386a48dc 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py @@ -107,7 +107,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): if compile_version >= valid_version: return 1, 2 else: - if attrs[0]['approximate'] == True: + if attrs[0]['approximate']: return 0, 3 else: return 1, 2 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp.py index 1df42024992cc..8f39add1493ce 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp.py @@ -137,7 +137,7 @@ def teller1(program_config, predictor_config): and self.dynamic_shape.min_input_shape ): return True - if program_config.ops[0].attrs['align_corners'] == True: + if program_config.ops[0].attrs['align_corners']: return True return False diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py index b4eacbb136f06..7bdaab0ee841c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py @@ -29,7 +29,7 @@ def is_paddings_valid(self, program_config: ProgramConfig) -> bool: ksize = program_config.ops[0].attrs['ksize'] pooling_type = program_config.ops[0].attrs['pooling_type'] global_pooling = program_config.ops[0].attrs['global_pooling'] - if global_pooling == False: + if not global_pooling: if pooling_type == 'avg': for index in range(len(ksize)): if ksize[index] <= paddings[index]: @@ -174,10 +174,10 @@ def add_skip_trt_case(self): def teller(program_config, predictor_config): if ( program_config.ops[0].attrs['pooling_type'] == 'avg' - and program_config.ops[0].attrs['global_pooling'] == False - and program_config.ops[0].attrs['exclusive'] == True - and program_config.ops[0].attrs['adaptive'] == False - and program_config.ops[0].attrs['ceil_mode'] == True + and not program_config.ops[0].attrs['global_pooling'] + and program_config.ops[0].attrs['exclusive'] + and not program_config.ops[0].attrs['adaptive'] + and program_config.ops[0].attrs['ceil_mode'] ): return True return False diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py index f89527359d4d1..0dc286722d1ae 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py @@ -159,10 +159,10 @@ def clear_dynamic_shape(): def generate_trt_nodes_num(attrs, dynamic_shape): if self.num_input == 0: - if dynamic_shape == True: + if dynamic_shape: return 0, 5 elif self.num_input == 1: - if dynamic_shape == True: + if dynamic_shape: return 1, 3 else: return 0, 4 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py index 04c1e3259fc12..9c4c9071e37c9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py @@ -77,7 +77,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): ver = paddle_infer.get_trt_compile_version() if ( ver[0] * 
1000 + ver[1] * 100 + ver[2] * 10 < 8000 - and dynamic_shape == True + and dynamic_shape ): return 0, 3 else: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py index 18ea2abe6bc35..5e2e984fc7bd9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py @@ -192,7 +192,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if dynamic_shape == True: + if dynamic_shape: return 1, 3 else: return 0, 4 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py index 91e5e499b19b4..c891f236f2fce 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py @@ -181,7 +181,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if dynamic_shape == True: + if dynamic_shape: return 1, 4 else: return 0, 5 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py index d65d7e3c29f96..adcb5c5e4b90b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py @@ -81,7 +81,7 @@ def clear_dynamic_shape(): def generate_trt_nodes_num(attrs, dynamic_shape): ver = paddle_infer.get_trt_compile_version() if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7000: - if dynamic_shape == True: + if dynamic_shape: return 0, 3 else: return 1, 2 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py index 477ea649effd3..a5575eae55490 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py @@ -126,7 +126,7 @@ def clear_dynamic_shape(): def generate_trt_nodes_num(attrs, dynamic_shape): if self.dims == 1: return 0, 4 - if self.sort == False: + if not self.sort: return 0, 4 return 1, 3 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py index 9ebcd87399230..0a987ca1fb69c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py @@ -123,7 +123,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if dynamic_shape == True: + if dynamic_shape: return 1, 2 else: if attrs[0]['axis'][0] == 0: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py index a0d089c69c934..7e595a48c9d72 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py @@ -28,7 +28,7 @@ def 
is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]], batch, channel): - if attrs[0]['iou_aware'] == True: + if attrs[0]['iou_aware']: return np.ones([batch, 3 * (channel + 6), 13, 13]).astype( np.float32 ) @@ -108,7 +108,7 @@ def sample_predictor_configs( self, program_config ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - if attrs[0]['iou_aware'] == True: + if attrs[0]['iou_aware']: channel = 3 * (attrs[0]['class_num'] + 6) self.dynamic_shape.min_input_shape = { "yolo_box_input": [1, channel, 12, 12], diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py index 4001e2ba76ba7..d71e6446c6048 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py @@ -128,7 +128,7 @@ def init_scales(self): def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode self.init_scales() - self.check_output(check_dygraph=(self.use_mkldnn == False)) + self.check_output(check_dygraph=(not self.use_mkldnn)) def test_check_grad_normal(self): pass @@ -165,9 +165,7 @@ def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode self.init_scales() int_atol = 1 # different quantization techniques - self.check_output( - check_dygraph=(self.use_mkldnn == False), atol=int_atol - ) + self.check_output(check_dygraph=(not self.use_mkldnn), atol=int_atol) class TestUint8Scales(TestInt8Scales): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py index 3ca09093b814d..4881d1c3763e4 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py @@ -101,7 +101,7 @@ def init_scales(self): def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode self.init_scales() - self.check_output(check_dygraph=(self.use_mkldnn == False)) + self.check_output(check_dygraph=(not self.use_mkldnn)) def test_check_grad_normal(self): pass @@ -138,9 +138,7 @@ def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode self.init_scales() int_atol = 1 # different quantization techniques - self.check_output( - check_dygraph=(self.use_mkldnn == False), atol=int_atol - ) + self.check_output(check_dygraph=(not self.use_mkldnn), atol=int_atol) class TestUint8Scales(TestInt8Scales): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_onnx_format_quantization_mobilenetv1.py b/python/paddle/fluid/tests/unittests/mkldnn/test_onnx_format_quantization_mobilenetv1.py index 1b27a39f2e956..218900b35f4f3 100755 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_onnx_format_quantization_mobilenetv1.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_onnx_format_quantization_mobilenetv1.py @@ -49,7 +49,7 @@ def resize_short(img, target_size): def crop_image(img, target_size, center): width, height = img.size size = target_size - if center == True: + if center: w_start = (width - size) / 2 h_start = (height - size) / 2 else: diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 6147c88dc5680..baa4f26feb81c 100644 --- 
a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -371,25 +371,22 @@ def is_empty_grad_op(op_type): return True def is_xpu_op_test(): - return hasattr(cls, "use_xpu") and cls.use_xpu == True + return hasattr(cls, "use_xpu") and cls.use_xpu def is_mkldnn_op_test(): - return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True + return hasattr(cls, "use_mkldnn") and cls.use_mkldnn def is_rocm_op_test(): return core.is_compiled_with_rocm() def is_npu_op_test(): - return hasattr(cls, "use_npu") and cls.use_npu == True + return hasattr(cls, "use_npu") and cls.use_npu def is_mlu_op_test(): - return hasattr(cls, "use_mlu") and cls.use_mlu == True + return hasattr(cls, "use_mlu") and cls.use_mlu def is_custom_device_op_test(): - return ( - hasattr(cls, "use_custom_device") - and cls.use_custom_device == True - ) + return hasattr(cls, "use_custom_device") and cls.use_custom_device if not hasattr(cls, "op_type"): raise AssertionError( @@ -465,17 +462,17 @@ def is_bfloat16_op(self): ) def is_mkldnn_op(self): - return (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or ( + return (hasattr(self, "use_mkldnn") and self.use_mkldnn) or ( hasattr(self, "attrs") and "use_mkldnn" in self.attrs - and self.attrs["use_mkldnn"] == True + and self.attrs["use_mkldnn"] ) def is_xpu_op(self): - return (hasattr(self, "use_xpu") and self.use_xpu == True) or ( + return (hasattr(self, "use_xpu") and self.use_xpu) or ( hasattr(self, "attrs") and "use_xpu" in self.attrs - and self.attrs["use_xpu"] == True + and self.attrs["use_xpu"] ) # set the self.output_dtype . @@ -1542,7 +1539,7 @@ def check_output_with_place( ): # disable legacy dygraph check when check_eager is True - if check_eager == True: + if check_eager: check_dygraph = False def find_imperative_actual(target_name, dygraph_outs, place): @@ -1912,7 +1909,7 @@ def _is_skip_name(self, name): ) if check_eager: - assert check_dygraph == False + assert not check_dygraph return outs, eager_dygraph_outs, fetch_list elif check_dygraph: return outs, dygraph_outs, fetch_list @@ -2002,7 +1999,7 @@ def check_output( ): # disable legacy dygraph check when check_eager is True - if check_eager == True: + if check_eager: check_dygraph = False self.__class__.op_type = self.op_type @@ -2024,7 +2021,7 @@ def check_output( check_eager=check_eager, ) if check_eager: - assert check_dygraph == False + assert not check_dygraph outs, eager_dygraph_outs, fetch_list = res elif check_dygraph: outs, dygraph_outs, fetch_list = res @@ -2143,7 +2140,7 @@ def check_grad( ): # disable legacy dygraph check when check_eager is True - if check_eager == True: + if check_eager: check_dygraph = False self._check_grad_helper() @@ -2180,7 +2177,7 @@ def check_grad_with_place( ): # disable legacy dygraph check when check_eager is True - if check_eager == True: + if check_eager: check_dygraph = False self.scope = core.Scope() @@ -2207,7 +2204,7 @@ def check_grad_with_place( # oneDNN numeric gradient should use CPU kernel use_onednn = False - if "use_mkldnn" in op_attrs and op_attrs["use_mkldnn"] == True: + if "use_mkldnn" in op_attrs and op_attrs["use_mkldnn"]: op_attrs["use_mkldnn"] = False use_onednn = True diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py index 220bd09f2cafa..295f6a67d8d62 100644 --- a/python/paddle/fluid/tests/unittests/op_test_xpu.py +++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py @@ -51,7 +51,7 @@ def is_empty_grad_op(op_type): if cls.dtype == 
 np.float16:
             place = paddle.XPUPlace(0)
-            if core.is_float16_supported(place) == False:
+            if not core.is_float16_supported(place):
                 return
 
         if cls.dtype == np.float64:
@@ -98,7 +98,7 @@ def check_output_with_place(
             return
 
         if self.dtype == np.float16:
-            if core.is_float16_supported(place) == False:
+            if not core.is_float16_supported(place):
                 return
 
         if self.dtype == np.float16:
@@ -172,7 +172,7 @@ def check_grad_with_place(
             return
 
         if self.dtype == np.float16:
-            if core.is_float16_supported(place) == False:
+            if not core.is_float16_supported(place):
                 return
 
         if self.dtype == np.float16:
@@ -254,7 +254,7 @@ def get_grad_with_place(
 
         # oneDNN numeric gradient should use CPU kernel
        use_onednn = False
-        if "use_mkldnn" in op_attrs and op_attrs["use_mkldnn"] == True:
+        if "use_mkldnn" in op_attrs and op_attrs["use_mkldnn"]:
            op_attrs["use_mkldnn"] = False
            use_onednn = True
diff --git a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py
index 08055ae170393..ccaed0b984fed 100755
--- a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py
+++ b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py
@@ -167,7 +167,7 @@ def get_user_defined_strategy(config):
         strategy.is_fl_ps_mode = (
             True if config.get("runner.is_fl_ps_mode") == 1 else False
         )
-        if strategy.is_fl_ps_mode == True:
+        if strategy.is_fl_ps_mode:
             strategy.pipeline = False
             micro_num = 1
             strategy.pipeline_configs = {
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 937d30cd74aac..9904ee0d100a3 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -1126,11 +1126,11 @@ def _adam_optimize_dygraph(
         )
 
         for idx in range(2):
-            if place == 'gpu' and use_amp == True:
+            if place == 'gpu' and use_amp:
                 model = paddle.amp.decorate(models=model, level='O2')
                 scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
 
-            if place == 'gpu' and use_amp == True:
+            if place == 'gpu' and use_amp:
                 with paddle.amp.auto_cast(level='O2'):
                     output = model(input)
                     loss = paddle.mean(output)
diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
index 6e4a7b43f20bf..15c8bf69bc01b 100644
--- a/python/paddle/fluid/tests/unittests/test_adamw_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -302,11 +302,11 @@ def _test_adamw_op_dygraph_place_amp(self, place, use_amp=False):
         )
 
         for idx in range(2):
-            if place == 'gpu' and use_amp == True:
+            if place == 'gpu' and use_amp:
                 model = paddle.amp.decorate(models=model, level='O2')
                 scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
 
-            if place == 'gpu' and use_amp == True:
+            if place == 'gpu' and use_amp:
                 with paddle.amp.auto_cast(level='O2'):
                     output = model(input)
                     loss = paddle.mean(output)
diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
index c56235cd0d129..41fc17187093c 100644
--- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
@@ -177,8 +177,8 @@ def train(use_cuda, thread_num, cpu_num):
                     fetch_list=[array, acc, prediction, avg_loss.name]
                 )
 
-                assert numpy.allclose(array_v[0], prediction_v) == True
-                assert numpy.allclose(array_v[1], acc_v) == True
+                assert numpy.allclose(array_v[0], prediction_v)
+                assert numpy.allclose(array_v[1], acc_v)
 
                 loss_val = numpy.mean(loss_val)
                 if step % 10 == 0:
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index 381640621cb1c..ccd7de2c3171f 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -313,7 +313,7 @@ def check_with_place(self, place, data_layout, dtype, shape):
             #  dims will be in NCHW order as it is MKL-DNN way
             #  of memory descripting. So we need to convert NCHW
             #  dims into NHWC.
-            if data_layout == "NHWC" and self.use_mkldnn == True:
+            if data_layout == "NHWC" and self.use_mkldnn:
                 # Create executor to have MKL-DNN cache
                 # cleared after NHWC unit test
                 place = core.CPUPlace()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
index 778489eb668dc..f5db751169a1f 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
@@ -391,7 +391,7 @@ def test_global_stats(self):
                 )
                 net2.weight = net1.weight
                 net2.bias = net1.bias
-                if self.trainable_statistics == True:
+                if self.trainable_statistics:
                     net1.training = False
                     net2.training = False
                 y1 = net1(x)
diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
index 233a686c05806..fd3106f9c6f84 100644
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
@@ -20,8 +20,8 @@
 
 
 def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0):
-    pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False)
-    pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False)
+    pb_w = p_box[:, 2] - p_box[:, 0] + (not norm)
+    pb_h = p_box[:, 3] - p_box[:, 1] + (not norm)
     pb_x = pb_w * 0.5 + p_box[:, 0]
     pb_y = pb_h * 0.5 + p_box[:, 1]
     shape = (1, p_box.shape[0]) if axis == 0 else (p_box.shape[0], 1)
@@ -55,8 +55,8 @@ def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0):
 
 
 def box_encoder(t_box, p_box, pb_v, output_box, norm):
-    pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False)
-    pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False)
+    pb_w = p_box[:, 2] - p_box[:, 0] + (not norm)
+    pb_h = p_box[:, 3] - p_box[:, 1] + (not norm)
     pb_x = pb_w * 0.5 + p_box[:, 0]
     pb_y = pb_h * 0.5 + p_box[:, 1]
     shape = (1, p_box.shape[0])
diff --git a/python/paddle/fluid/tests/unittests/test_center_loss.py b/python/paddle/fluid/tests/unittests/test_center_loss.py
index b7eda71c0217b..7bf68100e029d 100644
--- a/python/paddle/fluid/tests/unittests/test_center_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_center_loss.py
@@ -58,7 +58,7 @@ def setUp(self):
             'CenterUpdateRate': rate,
         }
 
-        if self.need_update == True:
+        if self.need_update:
             self.outputs = {
                 'SampleCenterDiff': output,
                 'Loss': loss,
diff --git a/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py b/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py
index bc88cba96eb6e..fb8a7057fd7d4 100644
--- a/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py
@@ -115,7 +115,7 @@ def test_dynamic_api(self):
         x = paddle.ones(shape=[10, 10], dtype="int32")
         y = paddle.ones(shape=[10, 10], dtype="int32")
         out = paddle.equal_all(x, y)
-        assert out.numpy()[0] == True
+        assert out.numpy()[0] is np.True_
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index 34a34f062efb8..0c22f7ff7b277 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -477,13 +477,12 @@ def test_check_output(self):
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         self.check_output_with_place(
-            place, atol=1e-5, check_dygraph=(self.use_mkldnn == False)
+            place, atol=1e-5, check_dygraph=(not self.use_mkldnn)
         )
 
     def test_check_grad(self):
         if self.dtype == np.float16 or (
-            hasattr(self, "no_need_check_grad")
-            and self.no_need_check_grad == True
+            hasattr(self, "no_need_check_grad") and self.no_need_check_grad
         ):
             return
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
@@ -493,13 +492,12 @@ def test_check_grad(self):
             {'Input', 'Filter'},
             'Output',
             max_relative_error=0.02,
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
         )
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16 or (
-            hasattr(self, "no_need_check_grad")
-            and self.no_need_check_grad == True
+            hasattr(self, "no_need_check_grad") and self.no_need_check_grad
         ):
             return
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
@@ -510,13 +508,12 @@ def test_check_grad_no_filter(self):
             'Output',
             max_relative_error=0.02,
             no_grad_set=set(['Filter']),
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
         )
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16 or (
-            hasattr(self, "no_need_check_grad")
-            and self.no_need_check_grad == True
+            hasattr(self, "no_need_check_grad") and self.no_need_check_grad
         ):
             return
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
@@ -526,7 +523,7 @@ def test_check_grad_no_input(self):
             ['Filter'],
             'Output',
             no_grad_set=set(['Input']),
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
         )
 
     def init_test_case(self):
@@ -804,7 +801,7 @@ def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         place = core.CUDAPlace(0) if self.has_cuda() else core.CPUPlace()
         self.check_output_with_place(
-            place, atol=1e-5, check_dygraph=(self.use_mkldnn == False)
+            place, atol=1e-5, check_dygraph=(not self.use_mkldnn)
        )
 
     def test_check_grad(self):
@@ -817,7 +814,7 @@ def test_check_grad(self):
             {'Input', 'Filter'},
             'Output',
             max_relative_error=0.02,
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
         )
 
     def test_check_grad_no_filter(self):
@@ -831,7 +828,7 @@ def test_check_grad_no_filter(self):
             'Output',
             max_relative_error=0.02,
             no_grad_set=set(['Filter']),
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
         )
 
     def test_check_grad_no_input(self):
@@ -844,7 +841,7 @@ def test_check_grad_no_input(self):
             ['Filter'],
             'Output',
             no_grad_set=set(['Input']),
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
         )
 
     def init_test_case(self):
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index 29ffbd80d3623..482da8164b245 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -183,10 +183,10 @@ def test_check_output(self):
         if self.use_cudnn:
             place = core.CUDAPlace(0)
             self.check_output_with_place(
-                place, atol=1e-5, check_dygraph=(self.use_mkldnn == False)
+                place, atol=1e-5, check_dygraph=(not self.use_mkldnn)
             )
         else:
-            self.check_output(check_dygraph=(self.use_mkldnn == False))
+            self.check_output(check_dygraph=(not self.use_mkldnn))
 
     def test_check_grad_no_input(self):
         if self.need_check_grad:
@@ -724,10 +724,10 @@ def test_check_output(self):
         if self.use_cudnn:
             place = core.CUDAPlace(0)
             self.check_output_with_place(
-                place, atol=0.02, check_dygraph=(self.use_mkldnn == False)
+                place, atol=0.02, check_dygraph=(not self.use_mkldnn)
             )
         else:
-            self.check_output(check_dygraph=(self.use_mkldnn == False))
+            self.check_output(check_dygraph=(not self.use_mkldnn))
 
 
 @unittest.skipIf(
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index eaa6ba04c64e6..54a3621e0ba72 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -327,7 +327,7 @@ def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace()
         self.check_output_with_place(
-            place, atol=1e-5, check_dygraph=(self.use_mkldnn == False)
+            place, atol=1e-5, check_dygraph=(not self.use_mkldnn)
         )
 
     def test_check_grad(self):
@@ -340,7 +340,7 @@ def test_check_grad(self):
             {'Input', 'Filter'},
             'Output',
             max_relative_error=0.03,
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
         )
 
     def test_check_grad_no_filter(self):
@@ -354,7 +354,7 @@ def test_check_grad_no_filter(self):
             'Output',
             max_relative_error=0.03,
             no_grad_set=set(['Filter']),
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
         )
 
     def test_check_grad_no_input(self):
@@ -368,7 +368,7 @@ def test_check_grad_no_input(self):
             'Output',
             max_relative_error=0.03,
             no_grad_set=set(['Input']),
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
         )
 
     def init_test_case(self):
diff --git a/python/paddle/fluid/tests/unittests/test_dataset_download.py b/python/paddle/fluid/tests/unittests/test_dataset_download.py
index f1fba215b931f..b009a2fe58dca 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset_download.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset_download.py
@@ -34,7 +34,7 @@ def test_download_url(self):
         except Exception as e:
             catch_exp = True
 
-        self.assertTrue(catch_exp == False)
+        self.assertTrue(not catch_exp)
 
         file_path = DATA_HOME + "/flowers/imagelabels.mat"
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 4c109feaef235..6212de9ebcfa4 100755
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -1330,8 +1330,8 @@ def _get_gloo_trainer_cmd(
             tr_cmd += " --diff_batch"
         self.__use_cuda = False
         self.__use_xpu = False
-        assert self.__use_cuda == False, "gloo not support use cuda"
-        assert self.__use_xpu == False, "gloo not support use xpu"
+        assert not self.__use_cuda, "gloo not support use cuda"
+        assert not self.__use_xpu, "gloo not support use xpu"
         tr_cmd += " --use_cpu"
         env.update(
             {
@@ -1345,7 +1345,7 @@ def _get_gloo_trainer_cmd(
             }
         )
 
-        assert self._use_dgc == False, "gloo not support use dgc"
+        assert not self._use_dgc, "gloo not support use dgc"
 
         if self._accumulate_gradient:
             tr_cmd += " --accumulate_gradient"
@@ -1353,7 +1353,7 @@ def _get_gloo_trainer_cmd(
         if self._find_unused_parameters:
             tr_cmd += " --find_unused_parameters"
 
-        assert self._pipeline_mode == False, "gloo not support use pipeline"
+        assert not self._pipeline_mode, "gloo not support use pipeline"
         if self._enable_backward_deps:  # build strategy, save it
             tr_cmd += " --enable_backward_deps"
 
@@ -1361,8 +1361,8 @@ def _get_gloo_trainer_cmd(
         if self._fuse_all_reduce is not None:
             tr_cmd += " --fuse_all_reduce {}".format(self._fuse_all_reduce)
 
-        assert self._use_fleet_api == False, "gloo not support use fleet api"
-        assert self._use_fleet_api_20 == False, "gloo not support use fleet api"
+        assert not self._use_fleet_api, "gloo not support use fleet api"
+        assert not self._use_fleet_api_20, "gloo not support use fleet api"
         return tr_cmd, env
 
     def _get_nccl2_trainer_cmd(
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index ae516bc44cad3..6bfd14dc84152 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -46,12 +46,12 @@ def setUp(self):
         self.outputs = {'Out': self.out}
 
     def check_eager(self):
-        return self.use_mkldnn == False and self.axis == -1
+        return not self.use_mkldnn and self.axis == -1
 
     def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         self.check_output(
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
             check_eager=self.check_eager(),
         )
 
@@ -62,7 +62,7 @@ def test_check_grad_normal(self):
         self.check_grad(
             ['X', 'Y'],
             'Out',
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
             check_eager=self.check_eager(),
         )
 
@@ -74,7 +74,7 @@ def test_check_grad_ingore_x(self):
             ['Y'],
             'Out',
             no_grad_set=set("X"),
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
             check_eager=self.check_eager(),
         )
 
@@ -86,7 +86,7 @@ def test_check_grad_ingore_y(self):
             ['X'],
             'Out',
             no_grad_set=set('Y'),
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
             check_eager=self.check_eager(),
         )
 
@@ -115,7 +115,7 @@ def test_check_output(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
             self.check_output_with_place(
-                place, atol=1e-3, check_dygraph=(self.use_mkldnn == False)
+                place, atol=1e-3, check_dygraph=(not self.use_mkldnn)
             )
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
index cc3cd9be8236c..987a17ff1f5ea 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@@ -49,13 +49,11 @@ def setUp(self):
 
     def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=(self.use_mkldnn == False))
+        self.check_output(check_dygraph=(not self.use_mkldnn))
 
     def test_check_grad_normal(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_grad(
-            ['X', 'Y'], 'Out', check_dygraph=(self.use_mkldnn == False)
-        )
+        self.check_grad(['X', 'Y'], 'Out', check_dygraph=(not self.use_mkldnn))
 
     def test_check_grad_ingore_x(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         self.check_grad(
             ['Y'],
             'Out',
             no_grad_set=set("X"),
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
         )
 
     def test_check_grad_ingore_y(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         self.check_grad(
             ['X'],
             'Out',
             no_grad_set=set('Y'),
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
         )
 
     def init_input_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_empty_like_op.py b/python/paddle/fluid/tests/unittests/test_empty_like_op.py
index 4ce4ab6a6d527..82ad72e11e5f2 100644
--- a/python/paddle/fluid/tests/unittests/test_empty_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_empty_like_op.py
@@ -47,8 +47,8 @@ def __check_out__(self, out):
             )
         elif data_type in ['bool']:
             total_num = out.size
-            true_num = np.sum(out == True)
-            false_num = np.sum(out == False)
+            true_num = np.sum(out)
+            false_num = np.sum(~out)
             self.assertTrue(
                 total_num == true_num + false_num,
                 'The value should always be True or False.',
diff --git a/python/paddle/fluid/tests/unittests/test_empty_op.py b/python/paddle/fluid/tests/unittests/test_empty_op.py
index 11b66325c1f5b..7b488aa0c6dda 100644
--- a/python/paddle/fluid/tests/unittests/test_empty_op.py
+++ b/python/paddle/fluid/tests/unittests/test_empty_op.py
@@ -43,8 +43,8 @@ def verify_output(self, outs):
             )
         elif data_type in ['bool']:
             total_num = outs[0].size
-            true_num = np.sum(outs[0] == True)
-            false_num = np.sum(outs[0] == False)
+            true_num = np.sum(outs[0])
+            false_num = np.sum(~outs[0])
             self.assertTrue(
                 total_num == true_num + false_num,
                 'The value should always be True or False.',
@@ -132,8 +132,8 @@ def verify_output(self, outs):
             )
         elif data_type in ['bool']:
             total_num = outs[0].size
-            true_num = np.sum(outs[0] == True)
-            false_num = np.sum(outs[0] == False)
+            true_num = np.sum(outs[0])
+            false_num = np.sum(~outs[0])
             self.assertTrue(
                 total_num == true_num + false_num,
                 'The value should always be True or False.',
@@ -182,8 +182,8 @@ def verify_output(self, outs):
             )
         elif data_type in ['bool']:
             total_num = outs[0].size
-            true_num = np.sum(outs[0] == True)
-            false_num = np.sum(outs[0] == False)
+            true_num = np.sum(outs[0])
+            false_num = np.sum(~outs[0])
             self.assertTrue(
                 total_num == true_num + false_num,
                 'The value should always be True or False.',
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_trainable.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_trainable.py
index b0dcfd653fb75..dc4ad0cea15da 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_layer_trainable.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_trainable.py
@@ -29,16 +29,16 @@ def func_set_trainable(self):
             linear = dygraph.Linear(10, 10)
 
             y = linear(label)
 
-            self.assertTrue(y.stop_gradient == False)
+            self.assertFalse(y.stop_gradient)
 
             linear.weight.trainable = False
             linear.bias.trainable = False
 
-            self.assertTrue(linear.weight.trainable == False)
-            self.assertTrue(linear.weight.stop_gradient == True)
+            self.assertFalse(linear.weight.trainable)
+            self.assertTrue(linear.weight.stop_gradient)
 
             y = linear(label)
 
-            self.assertTrue(y.stop_gradient == True)
+            self.assertTrue(y.stop_gradient)
 
             with self.assertRaises(ValueError):
                 linear.weight.trainable = "1"
diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py
index 68e88c9ba2a81..0c52d7596c129 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_op.py
@@ -28,13 +28,13 @@
 
 
 def mean_wrapper(x, axis=None, keepdim=False, reduce_all=False):
-    if reduce_all == True:
+    if reduce_all:
         return paddle.mean(x, range(len(x.shape)), keepdim)
     return paddle.mean(x, axis, keepdim)
 
 
 def reduce_mean_wrapper(x, axis=0, keepdim=False, reduce_all=False):
-    if reduce_all == True:
+    if reduce_all:
         return paddle.mean(x, range(len(x.shape)), keepdim)
     return paddle.mean(x, axis, keepdim)
 
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
index 017b001e259d6..fd9b8b88016bd 100644
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -910,10 +910,10 @@ def _momentum_optimize_dygraph(
             multi_precision=use_amp,
         )
         for idx in range(5):
-            if place == 'gpu' and use_amp == True:
+            if place == 'gpu' and use_amp:
                 model = paddle.amp.decorate(models=model, level='O2')
                 scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
-            if place == 'gpu' and use_amp == True:
+            if place == 'gpu' and use_amp:
                 with paddle.amp.auto_cast(level='O2'):
                     output = model(input)
                     loss = paddle.mean(output)
diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index cde6c8daf96be..6cc6fdd4311eb 100644
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -146,12 +146,8 @@ def iou(box_a, box_b, norm):
     xmax_b = max(box_b[0], box_b[2])
     ymax_b = max(box_b[1], box_b[3])
 
-    area_a = (ymax_a - ymin_a + (norm == False)) * (
-        xmax_a - xmin_a + (norm == False)
-    )
-    area_b = (ymax_b - ymin_b + (norm == False)) * (
-        xmax_b - xmin_b + (norm == False)
-    )
+    area_a = (ymax_a - ymin_a + (not norm)) * (xmax_a - xmin_a + (not norm))
+    area_b = (ymax_b - ymin_b + (not norm)) * (xmax_b - xmin_b + (not norm))
     if area_a <= 0 and area_b <= 0:
         return 0.0
 
@@ -160,9 +156,7 @@ def iou(box_a, box_b, norm):
     xb = min(xmax_a, xmax_b)
     yb = min(ymax_a, ymax_b)
 
-    inter_area = max(xb - xa + (norm == False), 0.0) * max(
-        yb - ya + (norm == False), 0.0
-    )
+    inter_area = max(xb - xa + (not norm), 0.0) * max(yb - ya + (not norm), 0.0)
 
     iou_ratio = inter_area / (area_a + area_b - inter_area)
 
diff --git a/python/paddle/fluid/tests/unittests/test_ops_nms.py b/python/paddle/fluid/tests/unittests/test_ops_nms.py
index 573231a8a725a..be4d5f4921324 100644
--- a/python/paddle/fluid/tests/unittests/test_ops_nms.py
+++ b/python/paddle/fluid/tests/unittests/test_ops_nms.py
@@ -55,7 +55,7 @@ def multiclass_nms(boxes, scores, category_idxs, iou_threshold, top_k):
 
         mask[cur_category_boxes_idxs[cur_category_keep_boxes_sub_idxs]] = True
 
-    keep_boxes_idxs = _find(mask == True)
+    keep_boxes_idxs = _find(mask)
     topK_sub_indices = np.argsort(-scores[keep_boxes_idxs])[:top_k]
 
     return keep_boxes_idxs[topK_sub_indices]
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index 17ad33f67ab99..ab5d5ac46daf9 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -784,7 +784,7 @@ def net(self, return_input=False, with_dropout=False, with_seed=False):
             type="mean", inputs={"X": b2_out}, outputs={"Out": mean_out}
         )
 
-        if return_input == True:
+        if return_input:
             return mul_x, mul_out, b1_out, b2_out, mean_out
         return mul_out, b1_out, b2_out, mean_out
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py
index 9c8d342993739..e0e545448b5b3 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py
@@ -58,8 +58,8 @@ def check_drop_scope(self, use_cuda=True):
             train_exe.run(feed={"X": x}, fetch_list=[loss.name])
             test_exe.run(feed={"X": x}, fetch_list=[loss.name])
 
-            assert train_exe._need_create_local_exe_scopes() == False
-            assert test_exe._need_create_local_exe_scopes() == False
+            assert not train_exe._need_create_local_exe_scopes()
+            assert not test_exe._need_create_local_exe_scopes()
 
             # drop the local execution scope immediately
             train_exe.drop_local_exe_scopes()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index 7c44827262d6c..b2ae6318cc5de 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -181,7 +181,7 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
     if padding_algorithm == "VALID":
         paddings = [0, 0, 0, 0]
-        if ceil_mode != False:
+        if ceil_mode is not False:
            raise ValueError(
                "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode)"
                " must be False. "
@@ -346,10 +346,10 @@ def test_check_output(self):
         if self.has_cudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(
-                place, atol=1e-5, check_dygraph=(self.use_mkldnn == False)
+                place, atol=1e-5, check_dygraph=(not self.use_mkldnn)
             )
         else:
-            self.check_output(check_dygraph=(self.use_mkldnn == False))
+            self.check_output(check_dygraph=(not self.use_mkldnn))
 
     def test_check_grad(self):
         if self.dtype == np.float16:
@@ -362,14 +362,14 @@ def test_check_grad(self):
                 set(['X']),
                 'Out',
                 max_relative_error=0.07,
-                check_dygraph=(self.use_mkldnn == False),
+                check_dygraph=(not self.use_mkldnn),
             )
         elif self.pool_type != "max":
             self.check_grad(
                 set(['X']),
                 'Out',
                 max_relative_error=0.07,
-                check_dygraph=(self.use_mkldnn == False),
+                check_dygraph=(not self.use_mkldnn),
             )
 
     def init_data_format(self):
@@ -512,7 +512,7 @@ def test_check_output(self):
                 self.check_output_with_place(
                     place,
                     atol=1e-3,
-                    check_dygraph=(self.use_mkldnn == False),
+                    check_dygraph=(not self.use_mkldnn),
                 )
 
         def test_check_grad(self):
@@ -528,7 +528,7 @@ def test_check_grad(self):
                     set(['X']),
                     'Out',
                     max_relative_error=0.07,
-                    check_dygraph=(self.use_mkldnn == False),
+                    check_dygraph=(not self.use_mkldnn),
                 )
 
     cls_name = "{0}_{1}".format(parent.__name__, "CUDNNFp16Op")
@@ -553,7 +553,7 @@ def test_check_output(self):
                 self.check_output_with_place(
                     place,
                     atol=1e-3,
-                    check_dygraph=(self.use_mkldnn == False),
+                    check_dygraph=(not self.use_mkldnn),
                 )
 
         def test_check_grad(self):
@@ -569,7 +569,7 @@ def test_check_grad(self):
                    set(['X']),
                    'Out',
                    max_relative_error=0.07,
-                    check_dygraph=(self.use_mkldnn == False),
+                    check_dygraph=(not self.use_mkldnn),
                )
 
     cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op")
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index a5bf506695648..09222e99c3622 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -68,7 +68,7 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
     if padding_algorithm == "VALID":
         paddings = [0, 0, 0, 0, 0, 0]
-        if ceil_mode != False:
+        if ceil_mode is not False:
            raise ValueError(
                "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode)"
                " must be False. "
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py
index 34420ce5a9c56..255a479998496 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
@@ -321,12 +321,12 @@ def dygraph_sgd_mp(self, mp):
         optimizer = paddle.optimizer.SGD(
             parameters=model.parameters(), multi_precision=mp
         )
-        if mp == True:
+        if mp:
             model = paddle.amp.decorate(models=model, level='O2')
             scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
 
         for idx in range(5):
-            if mp == True:
+            if mp:
                 with paddle.amp.auto_cast(level='O2'):
                     output = model(input)
                     loss = paddle.mean(output)
@@ -429,12 +429,12 @@ def dygraph_sgd_mp(self, mp):
             parameter_list=model.parameters(),
             multi_precision=mp,
         )
-        if mp == True:
+        if mp:
             model = paddle.amp.decorate(models=model, level='O2')
             scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
 
         for idx in range(5):
-            if mp == True:
+            if mp:
                 with paddle.amp.auto_cast(level='O2'):
                     output = model(input)
                     loss = paddle.mean(output)
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index c83f569cb11a1..18a5737225fa9 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -78,10 +78,10 @@ def test_check_output(self):
         if self.use_cudnn:
             place = core.CUDAPlace(0)
             self.check_output_with_place(
-                place, atol=1e-5, check_dygraph=(self.use_mkldnn == False)
+                place, atol=1e-5, check_dygraph=(not self.use_mkldnn)
             )
         else:
-            self.check_output(check_dygraph=(self.use_mkldnn == False))
+            self.check_output(check_dygraph=(not self.use_mkldnn))
 
     def test_check_grad(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
@@ -93,14 +93,14 @@ def test_check_grad(self):
                 ["X"],
                 "Out",
                 max_relative_error=0.01,
-                check_dygraph=(self.use_mkldnn == False),
+                check_dygraph=(not self.use_mkldnn),
             )
         else:
             self.check_grad(
                 ["X"],
                 "Out",
                 max_relative_error=0.01,
-                check_dygraph=(self.use_mkldnn == False),
+                check_dygraph=(not self.use_mkldnn),
             )
 
 
@@ -389,9 +389,7 @@ def init_cudnn(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(
-            place, check_dygraph=(self.use_mkldnn == False)
-        )
+        self.check_output_with_place(place, check_dygraph=(not self.use_mkldnn))
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
@@ -400,7 +398,7 @@ def test_check_grad(self):
             ["X"],
             "Out",
             numeric_grad_delta=0.05,
-            check_dygraph=(self.use_mkldnn == False),
+            check_dygraph=(not self.use_mkldnn),
         )
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
index fb0e46b6740b8..a623a311ccf1c 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
@@ -131,7 +131,7 @@ def setUp(self):
             softmax, labels, self.soft_label, self.axis, self.ignore_index
         )
 
-        if self.use_softmax == False:
+        if not self.use_softmax:
             self.inputs = {"Logits": softmax, "Label": labels}
         else:
             self.inputs = {"Logits": logits, "Label": labels}
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
index 3b29d335da42b..92e2d0200c80d 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
@@ -221,7 +221,7 @@ def setUp(self):
         self.key_padding_mask = key_padding_mask.astype(self.dtype)
         self.attn_mask = attn_mask.astype(self.dtype)
 
-        if self.use_mask == True:
+        if self.use_mask:
             result, result_sdd, result_softmax = ref_batch_sparse_attention(
                 self.q,
                 self.k,
@@ -236,7 +236,7 @@ def setUp(self):
                 self.q, self.k, self.v, self.offset, self.columns
             )
 
-        if self.use_mask == True:
+        if self.use_mask:
             self.inputs = {
                 'Q': self.q,
                 'K': self.k,
@@ -326,7 +326,7 @@ def test_static_graph(self):
             )
             key_padding_mask_shape = (self.shape[0], self.shape[2])
             attn_mask_shape = (self.shape[2], self.shape[2])
-            if self.use_mask == True:
+            if self.use_mask:
                 key_padding_mask = paddle.static.data(
                     name="KeyPaddingMask",
                     shape=key_padding_mask_shape,
@@ -367,7 +367,7 @@ def test_static_graph(self):
             attn_mask_np = attn_mask_np.astype(self.dtype)
 
             exe = fluid.Executor(self.place)
-            if self.use_mask == True:
+            if self.use_mask:
                 fetches_result = exe.run(
                     feed={
                         "Q": Q_np,
@@ -436,7 +436,7 @@ def test_dygraph(self):
         paddle_kp_mask = paddle.to_tensor(key_padding_mask, place=self.place)
         paddle_attn_mask = paddle.to_tensor(attn_mask, place=self.place)
 
-        if self.use_mask == True:
+        if self.use_mask:
             paddle_result = F.sparse_attention(
                 paddle_query,
                 paddle_key,
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index d1b820bd74c1d..38e65744a8110 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -1147,10 +1147,10 @@ def func_test_if(self):
             if var2:
                 var2_bool = True
 
-            assert var1_bool == False, "if var1 should be false"
-            assert var2_bool == True, "if var2 should be true"
-            assert bool(var1) == False, "bool(var1) is False"
-            assert bool(var2) == True, "bool(var2) is True"
+            assert not var1_bool, "if var1 should be false"
+            assert var2_bool, "if var2 should be true"
+            assert not bool(var1), "bool(var1) is False"
+            assert bool(var2), "bool(var2) is True"
 
     def test_if(self):
         with _test_eager_guard():
diff --git a/python/paddle/fluid/tests/unittests/test_where_op.py b/python/paddle/fluid/tests/unittests/test_where_op.py
index 9ae7d9a48331b..7420753d2d359 100644
--- a/python/paddle/fluid/tests/unittests/test_where_op.py
+++ b/python/paddle/fluid/tests/unittests/test_where_op.py
@@ -68,10 +68,10 @@ def init_data(self):
         self.out = np.where(self.cond, self.x, self.y)
 
     def ref_x_backward(self, dout):
-        return np.where((self.cond == True), dout, 0)
+        return np.where(self.cond, dout, 0)
 
     def ref_y_backward(self, dout):
-        return np.where((self.cond == False), dout, 0)
+        return np.where(~self.cond, dout, 0)
 
     def test_api(self, use_cuda=False):
         for x_stop_gradient in [False, True]:
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py
index 7d818cc02c868..c6c7d9f34d8a0 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py
@@ -377,7 +377,7 @@ def test_global_stats(self):
                 )
                 net2.weight = net1.weight
                 net2.bias = net1.bias
-                if self.trainable_statistics == True:
+                if self.trainable_statistics:
                     net1.training = False
                     net2.training = False
                 y1 = net1(x)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py
index 973e2908c4ecc..a7036f521817a 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py
@@ -261,10 +261,7 @@ def test_check_output(self):
             self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        if (
-            hasattr(self, "no_need_check_grad")
-            and self.no_need_check_grad == True
-        ):
+        if hasattr(self, "no_need_check_grad") and self.no_need_check_grad:
             return
         if core.is_compiled_with_xpu():
             paddle.enable_static()
@@ -273,10 +270,7 @@ def test_check_grad(self):
             )
 
     def test_check_grad_no_filter(self):
-        if (
-            hasattr(self, "no_need_check_grad")
-            and self.no_need_check_grad == True
-        ):
+        if hasattr(self, "no_need_check_grad") and self.no_need_check_grad:
             return
         if core.is_compiled_with_xpu():
             paddle.enable_static()
@@ -285,10 +279,7 @@ def test_check_grad_no_filter(self):
             )
 
     def test_check_grad_no_input(self):
-        if (
-            hasattr(self, "no_need_check_grad")
-            and self.no_need_check_grad == True
-        ):
+        if hasattr(self, "no_need_check_grad") and self.no_need_check_grad:
            return
         if core.is_compiled_with_xpu():
             paddle.enable_static()
@@ -433,10 +424,7 @@ def test_check_output(self):
 
     def test_check_grad(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        if (
-            hasattr(self, "no_need_check_grad")
-            and self.no_need_check_grad == True
-        ):
+        if hasattr(self, "no_need_check_grad") and self.no_need_check_grad:
             return
         if core.is_compiled_with_xpu():
             paddle.enable_static()
@@ -446,10 +434,7 @@ def test_check_grad(self):
 
     def test_check_grad_no_filter(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        if (
-            hasattr(self, "no_need_check_grad")
-            and self.no_need_check_grad == True
-        ):
+        if hasattr(self, "no_need_check_grad") and self.no_need_check_grad:
             return
         if core.is_compiled_with_xpu():
             paddle.enable_static()
@@ -459,10 +444,7 @@ def test_check_grad_no_filter(self):
 
     def test_check_grad_no_input(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        if (
-            hasattr(self, "no_need_check_grad")
-            and self.no_need_check_grad == True
-        ):
+        if hasattr(self, "no_need_check_grad") and self.no_need_check_grad:
             return
         if core.is_compiled_with_xpu():
             paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py
index 794ff490d7ec9..36434ce202025 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py
@@ -52,7 +52,7 @@ def setUp(self):
             }
 
             out = self.inputs['X'] * (1.0 - self.dropout_prob)
-            if self.is_test == False:
+            if not self.is_test:
                 mask = None
                 if self.dropout_prob == 0.0:
                     mask = np.ones(self.shape).astype(self.dtype)
@@ -78,7 +78,7 @@ def test_check_output(self):
         def test_check_grad_normal(self):
             if (
                 hasattr(self.__class__, "no_need_check_grad")
-                and self.__class__.no_need_check_grad == True
+                and self.__class__.no_need_check_grad
             ):
                 return
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py
index 22ee95c07d4ce..1d9c8c80f5ae5 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py
@@ -61,7 +61,7 @@ def test_check_grad_normal(self):
                 place,
                 ['X', 'Y'],
                 'Out',
-                check_dygraph=(self.use_mkldnn == False),
+                check_dygraph=(not self.use_mkldnn),
             )
 
     def test_check_grad_ingore_x(self):
@@ -72,7 +72,7 @@ def test_check_grad_ingore_x(self):
                 ['Y'],
                 'Out',
                 no_grad_set=set("X"),
-                check_dygraph=(self.use_mkldnn == False),
+                check_dygraph=(not self.use_mkldnn),
             )
 
     def test_check_grad_ingore_y(self):
@@ -83,7 +83,7 @@ def test_check_grad_ingore_y(self):
                 ['X'],
                 'Out',
                 no_grad_set=set('Y'),
-                check_dygraph=(self.use_mkldnn == False),
+                check_dygraph=(not self.use_mkldnn),
             )
 
     def init_input_output(self):
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py
index cb56e9b51f42d..f11740d74d482 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py
@@ -72,8 +72,8 @@ def verify_output(self, outs):
             )
         elif data_type in ['bool']:
             total_num = outs[0].size
-            true_num = np.sum(outs[0] == True)
-            false_num = np.sum(outs[0] == False)
+            true_num = np.sum(outs[0])
+            false_num = np.sum(~outs[0])
             self.assertTrue(
                 total_num == true_num + false_num,
                 'The value should always be True or False.',
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py
index c37d1bff5dd96..7c2a5ed2f0923 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py
@@ -106,14 +106,14 @@ def setUp(self):
             - 0.5,
         }
 
-        if self.trans_x == True:
+        if self.trans_x:
             numpy_input_x = (
                 self.inputs['X'].reshape((self.x_shape[0], -1)).T
             )
         else:
             numpy_input_x = self.inputs['X'].reshape((-1, self.x_shape[-1]))
 
-        if self.trans_y == True:
+        if self.trans_y:
             numpy_input_y = self.inputs['Y'].T
         else:
             numpy_input_y = self.inputs['Y']
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
index c4aab23a95201..21e46e31783a4 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
@@ -106,7 +106,7 @@ def generate_compatible_shapes(
         shape_Y = [BATCH_SIZE] + shape_Y
 
     if dim_Y == 3 and dim_X == 2:
-        if transpose_X == False:
+        if not transpose_X:
             shape_X[1] = shape_X[1] * BATCH_SIZE
         else:
             shape_X[0] = shape_X[0] * BATCH_SIZE
@@ -326,7 +326,7 @@ def test_check_output(self):
     def test_check_grad_normal(self):
         if (
             hasattr(self.__class__, "no_need_check_grad")
-            and self.__class__.no_need_check_grad == True
+            and self.__class__.no_need_check_grad
         ):
             return
 
@@ -338,7 +338,7 @@ def test_check_grad_normal(self):
     def test_check_grad_ignore_x(self):
         if (
             hasattr(self.__class__, "no_need_check_grad")
-            and self.__class__.no_need_check_grad == True
+            and self.__class__.no_need_check_grad
         ):
             return
 
@@ -350,7 +350,7 @@ def test_check_grad_ignore_x(self):
     def test_check_grad_ignore_y(self):
         if (
             hasattr(self.__class__, "no_need_check_grad")
-            and self.__class__.no_need_check_grad == True
+            and self.__class__.no_need_check_grad
         ):
             return
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
index 63354ac7607ee..3e873a965f6d9 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
@@ -101,7 +101,7 @@ def test_check_output(self):
     def test_check_grad(self):
         if (
             hasattr(self.__class__, "no_need_check_grad")
-            and self.__class__.no_need_check_grad == True
+            and self.__class__.no_need_check_grad
         ):
             return
         place = paddle.XPUPlace(0)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py
index 45c9f518cbddd..36cb5dfaefd8b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py
@@ -178,7 +178,7 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
     if padding_algorithm == "VALID":
         paddings = [0, 0, 0, 0]
-        if ceil_mode != False:
+        if ceil_mode is not False:
            raise ValueError(
                "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode)"
                " must be False. "
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py
index 18af22f3c6465..bd6accf59d1c0 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py
@@ -91,10 +91,10 @@ def init_data(self):
         self.out = np.where(self.cond, self.x, self.y)
 
     def ref_x_backward(self, dout):
-        return np.where(self.cond == True, dout, 0)
+        return np.where(self.cond, dout, 0)
 
     def ref_y_backward(self, dout):
-        return np.where(self.cond == False, dout, 0)
+        return np.where(~self.cond, dout, 0)
 
     def test_api(self):
         for x_stop_gradient in [False, True]:
diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py
index d90c64b76217b..14cb186fce95f 100644
--- a/python/paddle/hapi/model_summary.py
+++ b/python/paddle/hapi/model_summary.py
@@ -450,7 +450,7 @@ def _get_str_length(summary):
             total_output += np.sum(np.prod(output_shape, axis=-1))
 
             if "trainable" in summary[layer]:
-                if summary[layer]["trainable"] == True:
+                if summary[layer]["trainable"]:
                     trainable_params += summary[layer]["trainable_params"]
 
         summary_str += line_new + "\n"
diff --git a/python/paddle/incubate/autograd/primrules.py b/python/paddle/incubate/autograd/primrules.py
index badd8476463de..0532ade86c65f 100644
--- a/python/paddle/incubate/autograd/primrules.py
+++ b/python/paddle/incubate/autograd/primrules.py
@@ -515,7 +515,7 @@ def dropout_orig2prim(op, seed_t, x):
     ), 'Can not lower dropout into prim ops with seedtensor.'
     mask = bernoulli(shape=x.shape, dtype=x.dtype, p=op.attr('dropout_prob'))
     if op.attr('dropout_implementation') == 'upscale_in_train':
-        if op.attr('is_test') == False:
+        if not op.attr('is_test'):
             out = div(
                 mul(x, mask),
                 fill_const(1.0 - op.attr('dropout_prob'), x.shape, x.dtype),
@@ -524,7 +524,7 @@ def dropout_orig2prim(op, seed_t, x):
         else:
             return primops.cast(mask, dtype=paddle.uint8), x
     elif op.attr('dropout_implementation') == 'downgrade_in_infer':
-        if op.attr('is_test') == False:
+        if not op.attr('is_test'):
             return primops.cast(mask, dtype=paddle.uint8), mul(x, mask)
         else:
             return primops.cast(mask, dtype=paddle.uint8), mul(
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index b6936c5a90c9b..a61d05761303d 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -2109,7 +2109,7 @@ class centers and the shape of sampled_class_center will be [num_positive_class_
             #Tensor(shape=[7], dtype=int64, place=CUDAPlace(1), stop_gradient=True,
             #       [0, 1, 2, 3, 5, 7, 8])
     """
-    if not (group == False or group is None or hasattr(group, 'is_member')):
+    if not (group is False or group is None or hasattr(group, 'is_member')):
         raise ValueError(
             'Expected group is False, None or instance of paddle.distributed.collective.Group \
          (got group: {})'.format(
@@ -2124,7 +2124,7 @@ class centers and the shape of sampled_class_center will be [num_positive_class_
     ring_id = 0
     rank = 0
     nranks = 1
-    if group != False:
+    if group is not False:
         if core.is_compiled_with_dist():
             parallel_env = paddle.distributed.ParallelEnv()
             global_rank = parallel_env.rank
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 48cda9d0b4f95..b7e1045b6ee35 100755
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -2033,7 +2033,7 @@ def margin_cross_entropy(
     """
 
     assert reduction in ['mean', 'sum', 'none', None]
-    if not (group == False or group is None or hasattr(group, 'is_member')):
+    if not (group is False or group is None or hasattr(group, 'is_member')):
         raise ValueError(
             'Expected group is False, None or instance of paddle.distributed.collective.Group \
          (got group: {})'.format(
@@ -2048,7 +2048,7 @@ def margin_cross_entropy(
     ring_id = 0
     rank = 0
     nranks = 1
-    if group != False:
+    if group is not False:
         ring_id = 0 if group is None else group.id
         if core.is_compiled_with_dist():
             parallel_env = paddle.distributed.ParallelEnv()
@@ -2537,7 +2537,7 @@ def cross_entropy(
             "should be 'sum', 'mean' or 'none', but received %s, which is not allowed."
             % reduction
         )
-    if ignore_index > 0 and soft_label == True:
+    if ignore_index > 0 and soft_label:
         raise ValueError(
             "When soft_label == True, the value of 'ignore_index' in softmax_cross_entropy"
             "should be '-100', but received %s, which is not allowed."
@@ -2560,12 +2560,12 @@ def cross_entropy(
         label = paddle.unsqueeze(label, axis=axis)
 
     if in_dygraph_mode():
-        if soft_label == False:
+        if not soft_label:
             valid_label = (
                 paddle.cast(label != ignore_index, dtype=label.dtype) * label
             )
         if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
-            if soft_label == False:
+            if not soft_label:
                 _, _, out = _legacy_C_ops.softmax_with_cross_entropy(
                     input,
                     valid_label,
@@ -2603,7 +2603,7 @@ def cross_entropy(
 
         if weight is not None:
             # trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases.
-            if soft_label == True:
+            if soft_label:
                 # chajchaj:
                 # weight's shape is C, where C is class num.
                 # for 1d case: label's shape is [N,C], weight_gather's shape is N.
@@ -2710,7 +2710,7 @@ def cross_entropy(
         return out
 
     elif _in_legacy_dygraph():
-        if soft_label == False:
+        if not soft_label:
             valid_label = (
                 paddle.cast(label != ignore_index, dtype=label.dtype) * label
             )
@@ -2725,7 +2725,7 @@ def cross_entropy(
                     "Target {} is out of upper bound.".format(label_max.item())
                 )
         if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
-            if soft_label == False:
+            if not soft_label:
                 _, _, out = _legacy_C_ops.softmax_with_cross_entropy(
                     input,
                     valid_label,
@@ -2774,7 +2774,7 @@ def cross_entropy(
 
         if weight is not None:
             # trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases.
-            if soft_label == True:
+            if soft_label:
                 # chajchaj:
                 # weight's shape is C, where C is class num.
                 # for 1d case: label's shape is [N,C], weight_gather's shape is N.
@@ -2921,7 +2921,7 @@ def cross_entropy(
             weight, 'weight', ['float32', 'float64'], 'softmax_cross_entropy'
         )
         weight_name = name if reduction == 'none' else None
-        if soft_label == True:
+        if soft_label:
             # chajchaj:
             # trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases.
             # weight's shape is C, where C is class num.
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index f9ece56dc7ef5..d81987fa9eedb 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -110,7 +110,7 @@ def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False):
                 )
             )
         if padding == "VALID":
-            if ceil_mode != False:
+            if ceil_mode is not False:
                 raise ValueError(
                     "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
                     "Received ceil_mode: True."
diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py
index 344c1d482d421..ef09a1cd5e2b5 100644
--- a/python/paddle/nn/layer/distance.py
+++ b/python/paddle/nn/layer/distance.py
@@ -76,7 +76,7 @@ def extra_repr(self):
         main_str = 'p={p}'
         if self.epsilon != 1e-6:
             main_str += ', epsilon={epsilon}'
-        if self.keepdim != False:
+        if self.keepdim is not False:
             main_str += ', keepdim={keepdim}'
         if self.name != None:
             main_str += ', name={name}'
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index 1b5784fbedff1..5f4a4d8d1d8c7 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -71,7 +71,7 @@ def __init__(
     ):
         super(_InstanceNormBase, self).__init__()
 
-        if weight_attr == False or bias_attr == False:
+        if weight_attr is False or bias_attr is False:
             assert (
                 weight_attr == bias_attr
             ), "weight_attr and bias_attr must be set to False at the same time in InstanceNorm"
@@ -80,7 +80,7 @@ def __init__(
         self._bias_attr = bias_attr
         self._num_features = num_features
 
-        if weight_attr != False and bias_attr != False:
+        if weight_attr is not False and bias_attr is not False:
             self.scale = self.create_parameter(
                 attr=self._weight_attr,
                 shape=[num_features],
@@ -382,7 +382,7 @@ def __init__(
 
         param_shape = [self._num_channels]
 
-        if weight_attr == False:
+        if weight_attr is False:
             self.weight = self.create_parameter(
                 attr=None, shape=param_shape, default_initializer=Constant(1.0)
             )
@@ -398,7 +398,7 @@ def __init__(
             and self._weight_attr.learning_rate == 0.0
         )
 
-        if bias_attr == False:
+        if bias_attr is False:
             self.bias = self.create_parameter(
                 attr=None,
                 shape=param_shape,
@@ -619,7 +619,7 @@ def __init__(
         param_shape = [num_features]
 
         # create parameter
-        if weight_attr == False:
+        if weight_attr is False:
            self.weight = self.create_parameter(
                attr=None,
                shape=param_shape,
@@ -639,7 +639,7 @@ def __init__(
             and self._weight_attr.learning_rate == 0.0
         )
 
-        if bias_attr == False:
+        if bias_attr is False:
             self.bias = self.create_parameter(
                 attr=None,
                 shape=param_shape,
@@ -1315,7 +1315,10 @@ def convert_sync_batchnorm(cls, layer):
                 layer._name,
             )
 
-            if layer._weight_attr != False and layer._bias_attr != False:
+            if (
+                layer._weight_attr is not False
+                and layer._bias_attr is not False
+            ):
                 with no_grad():
                     layer_output.weight = layer.weight
                     layer_output.bias = layer.bias
diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py
index 7e0fa6d7d703d..bfebde2b5eee7 100644
--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -964,9 +964,9 @@ def __init__(
         for direction in range(self.num_directions):
             suffix = '_reverse' if direction == 1 else ''
             param_names.extend(['weight_ih_l{}{}', 'weight_hh_l{}{}'])
-            if bias_ih_attr != False:
+            if bias_ih_attr is not False:
                 param_names.append('bias_ih_l{}{}')
-            if bias_hh_attr != False:
+            if bias_hh_attr is not False:
                 param_names.append('bias_hh_l{}{}')
             param_names = [x.format(layer, suffix) for x in param_names]
             for name, param in zip(param_names, self.parameters()):
@@ -1187,7 +1187,7 @@ def extra_repr(self):
         main_str = '{input_size}, {hidden_size}'
         if self.num_layers != 1:
             main_str += ', num_layers={num_layers}'
-        if self.time_major != False:
+        if self.time_major is not False:
             main_str += ', time_major={time_major}'
         if self.dropout != 0:
             main_str += ', dropout={dropout}'
diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py
index 6eeaed7f86fec..7033df7fb3756 100644
--- a/python/paddle/nn/quant/quant_layers.py
+++ b/python/paddle/nn/quant/quant_layers.py
@@ -298,7 +298,7 @@ def __init__(
         reduce_type=None,
     ):
         assert (
-            quant_on_weight == True
+            quant_on_weight
         ), "Channel_wise only can be used on weight quantization."
super(FakeQuantChannelWiseAbsMax, self).__init__() self._quant_bits = quant_bits diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index c383323d51f98..8db866be45f2d 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -1237,7 +1237,7 @@ def format_ratio(ratio, indent=0): if statistic_data.event_summary.items: all_row_values = [] name_column_width = 52 - if thread_sep == True: + if thread_sep: thread_items = statistic_data.event_summary.thread_items else: thread_items = { @@ -1721,7 +1721,7 @@ def format_ratio(ratio, indent=0): 'ProfileStep' ].general_gpu_time ) - if thread_sep == True: + if thread_sep: userdefined_thread_items = ( statistic_data.event_summary.userdefined_thread_items ) diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index efe3975f14452..4d7b36554b590 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -164,7 +164,7 @@ def load_profiler_result(filename: str): def in_profiler_mode(): - return _is_profiler_used == True + return _is_profiler_used def wrap_optimizers(): @@ -182,7 +182,7 @@ def warpper(*args, **kwargs): return warpper global _has_optimizer_wrapped - if _has_optimizer_wrapped == True: + if _has_optimizer_wrapped: return import paddle.optimizer as optimizer diff --git a/python/paddle/sparse/nn/layer/norm.py b/python/paddle/sparse/nn/layer/norm.py index 936e43a18faf9..117fbf01a1d6a 100644 --- a/python/paddle/sparse/nn/layer/norm.py +++ b/python/paddle/sparse/nn/layer/norm.py @@ -398,7 +398,10 @@ def convert_sync_batchnorm(cls, layer): layer._name, ) - if layer._weight_attr != False and layer._bias_attr != False: + if ( + layer._weight_attr is not False + and layer._bias_attr is not False + ): with no_grad(): layer_output.weight = layer.weight layer_output.bias = layer.bias diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 3ed56a35dfa9c..5348681ad04a7 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -466,9 +466,7 @@ def inf_norm( if in_dygraph_mode(): out = _C_ops.abs(input) reduce_all = ( - True - if axis == None or axis == [] or asvector == True - else False + True if axis == None or axis == [] or asvector else False ) axis = axis if axis != None and axis != [] else [0] if reduce_all: @@ -487,9 +485,7 @@ def inf_norm( dtype=helper.input_dtype() ) - reduce_all = ( - True if axis == None or axis == [] or asvector == True else False - ) + reduce_all = True if axis == None or axis == [] or asvector else False axis = axis if axis != None and axis != [] else [0] reduce_type = ( @@ -1322,7 +1318,7 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None): avg = nx.sum(axis=1) / w_sum nx_w = nx - if w is not None and aweights is not None and ddof == True: + if w is not None and aweights is not None and ddof: norm_factor = w_sum - (w * aweights).sum() / w_sum else: norm_factor = w_sum - ddof diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 7c629a556b097..3379a60a3bc5e 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -3206,7 +3206,7 @@ def tile(x, repeat_times, name=None): check_variable_and_dtype( x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'tile' ) - if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: + if convert_dtype(x.dtype) == 'bool' and not x.stop_gradient: raise ValueError( "When 
the date type is bool for the input 'x' of tile op, you " "must set its stop_gradient to be True by " @@ -3288,7 +3288,7 @@ def expand_as(x, y, name=None): ) check_type(y, 'y', Variable, 'expand_as') - if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: + if convert_dtype(x.dtype) == 'bool' and not x.stop_gradient: raise ValueError( "When the data type of input 'x' for expand_as is bool, " "you must set its stop_gradient to be False by " @@ -3359,7 +3359,7 @@ def broadcast_to(x, shape, name=None): x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'broadcast_to' ) check_type(shape, 'shape', (list, tuple, Variable), 'broadcast_to') - if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: + if convert_dtype(x.dtype) == 'bool' and not x.stop_gradient: raise ValueError( "When the data type of input 'x' for broadcast_to is bool, " "you must set its stop_gradient to be False by " @@ -3457,7 +3457,7 @@ def expand(x, shape, name=None): 'expand', ) check_type(shape, 'shape', (list, tuple, Variable), 'expand') - if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: + if convert_dtype(x.dtype) == 'bool' and not x.stop_gradient: raise ValueError( "When the data type of input 'x' for expand is bool, " "you must set its stop_gradient to be False by " diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 8791ebb7af268..f5f448cf4ef82 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -188,7 +188,7 @@ def multinomial(x, num_samples=1, replacement=False, name=None): """ assert ( - core.is_compiled_with_rocm() == False + not core.is_compiled_with_rocm() ), "multinomial op is not supported on ROCM yet." if in_dygraph_mode(): diff --git a/python/paddle/text/datasets/conll05.py b/python/paddle/text/datasets/conll05.py index 807c8c3fbebc9..69f23cdaab932 100644 --- a/python/paddle/text/datasets/conll05.py +++ b/python/paddle/text/datasets/conll05.py @@ -228,9 +228,9 @@ def _load_anno(self): lbl_seq = [] verb_word = '' for l in lbl: - if l == '*' and is_in_bracket == False: + if l == '*' and not is_in_bracket: lbl_seq.append('O') - elif l == '*' and is_in_bracket == True: + elif l == '*' and is_in_bracket: lbl_seq.append('I-' + cur_tag) elif l == '*)': lbl_seq.append('I-' + cur_tag) diff --git a/tools/analysisPyXml.py b/tools/analysisPyXml.py index ee110d6ce7f07..b184ef76fcc54 100644 --- a/tools/analysisPyXml.py +++ b/tools/analysisPyXml.py @@ -46,28 +46,25 @@ def analysisPyXml(rootPath, ut): command = 'sed -n %sp %s' % (line_number, clazz_filename) _code, output = commands.getstatusoutput(command) if _code == 0: - if ( - output.strip().startswith( - ( - 'from', - 'import', - '__all__', - 'def', - 'class', - '"""', - '@', - '\'\'\'', - 'logger', - '_logger', - 'logging', - 'r"""', - 'pass', - 'try', - 'except', - 'if __name__ == "__main__"', - ) + if not output.strip().startswith( + ( + 'from', + 'import', + '__all__', + 'def', + 'class', + '"""', + '@', + '\'\'\'', + 'logger', + '_logger', + 'logging', + 'r"""', + 'pass', + 'try', + 'except', + 'if __name__ == "__main__"', ) - == False ): pattern = r"""(.*) = ('*')|(.*) = ("*")|(.*) = (\d)|(.*) = (-\d)|(.*) = (None)|(.*) = (True)|(.*) = (False)|(.*) = (URL_PREFIX*)|(.*) = (\[)|(.*) = (\{)|(.*) = (\()""" # a='b'/a="b"/a=0 if re.match(pattern, output.strip()) == None: diff --git a/tools/check_op_benchmark_result.py b/tools/check_op_benchmark_result.py index 8fce508102282..6c4e0fc06b6a5 100644 --- a/tools/check_op_benchmark_result.py +++ 
b/tools/check_op_benchmark_result.py @@ -40,7 +40,7 @@ def parse_log_file(log_file): for line in f.read().strip().split('\n')[::-1]: try: result = json.loads(line) - if result.get("disabled", False) == True: + if result.get("disabled", False): return None return result except ValueError: diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 0788da6e116ec..fcfa68bb4da48 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -349,7 +349,7 @@ def get_pr_ut(self): file_list.append(filename) else: isWhiteFile = self.get_is_white_file(filename) - if isWhiteFile == False: + if not isWhiteFile: file_list.append(filename) else: filterFiles.append(filename) @@ -417,7 +417,7 @@ def get_pr_ut(self): == tempfilename.split(".")[0] ): f_judge_in_added_ut = True - if f_judge_in_added_ut == True: + if f_judge_in_added_ut: print( "Adding new unit tests not hit mapFiles: %s" % f_judge diff --git a/tools/get_single_test_cov.py b/tools/get_single_test_cov.py index 266872feaf4e3..ee5f2d9fd5055 100644 --- a/tools/get_single_test_cov.py +++ b/tools/get_single_test_cov.py @@ -91,7 +91,7 @@ def analysisFNDAFile(rootPath, test): if matchObj == None: OP_REGIST = False break - if OP_REGIST == False: + if not OP_REGIST: related_file_list.append(clazz_filename) os.system( 'echo %s >> %s' % (clazz_filename, related_ut_map_file) diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py index 313d844d6f6ae..7ece773aa7855 100644 --- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py +++ b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py @@ -122,14 +122,14 @@ def generate_all_ops_inputs_outputs_map(op_descs): outpus = list() for input_ in op_proto[INPUTS]: if ( - op_proto[INPUTS][input_][EXTRA] != True - and op_proto[INPUTS][input_][INTERMEDIATE] != True + not op_proto[INPUTS][input_][EXTRA] + and not op_proto[INPUTS][input_][INTERMEDIATE] ): inputs.append(input_) for output_ in op_proto[OUTPUTS]: if ( - op_proto[OUTPUTS][output_][EXTRA] != True - and op_proto[OUTPUTS][output_][INTERMEDIATE] != True + not op_proto[OUTPUTS][output_][EXTRA] + and not op_proto[OUTPUTS][output_][INTERMEDIATE] ): outpus.append(output_) ops_inputs_map[op_type] = inputs @@ -214,9 +214,9 @@ def get_constraint(op_type, op_proto): optional_input_num_ = 0 for input_ in op_proto[INPUTS]: if ( - op_proto[INPUTS][input_][EXTRA] != True - and op_proto[INPUTS][input_][INTERMEDIATE] != True - and op_proto[INPUTS][input_][DISPENSABLE] == True + not op_proto[INPUTS][input_][EXTRA] + and not op_proto[INPUTS][input_][INTERMEDIATE] + and op_proto[INPUTS][input_][DISPENSABLE] ): optional_input_num_ += 1 if optional_input_num_ > 1: @@ -306,11 +306,11 @@ def convert_op_proto_into_mlir(op_descs): # 2.3.1 inputs for input_ in op_proto[INPUTS]: if ( - op_proto[INPUTS][input_][EXTRA] != True - and op_proto[INPUTS][input_][INTERMEDIATE] != True + not op_proto[INPUTS][input_][EXTRA] + and not op_proto[INPUTS][input_][INTERMEDIATE] ): - if op_proto[INPUTS][input_][DISPENSABLE] != True: - if op_proto[INPUTS][input_][DUPLICABLE] != True: + if not op_proto[INPUTS][input_][DISPENSABLE]: + if not op_proto[INPUTS][input_][DUPLICABLE]: ARGUMENTS = ( ARGUMENTS + " PD_Tensor:$" + input_ + "," ) @@ -319,7 +319,7 @@ def convert_op_proto_into_mlir(op_descs): ARGUMENTS + " PD_Tensor_Array:$" + input_ + "," ) else: - if op_proto[INPUTS][input_][DUPLICABLE] != True: + if not op_proto[INPUTS][input_][DUPLICABLE]: ARGUMENTS = ( ARGUMENTS + " Optional:$" @@ -350,7 +350,7 @@ def 
convert_op_proto_into_mlir(op_descs): # 2.3.2 attributes for attr in op_proto[ATTRS]: - if (op_proto[ATTRS][attr][EXTRA] == True) or ( + if (op_proto[ATTRS][attr][EXTRA]) or ( attr in skipped_attr_list ): continue @@ -434,10 +434,10 @@ def convert_op_proto_into_mlir(op_descs): outputs = "" for output_ in op_proto[OUTPUTS]: if ( - op_proto[OUTPUTS][output_][EXTRA] != True - and op_proto[OUTPUTS][output_][INTERMEDIATE] != True + not op_proto[OUTPUTS][output_][EXTRA] + and not op_proto[OUTPUTS][output_][INTERMEDIATE] ): - if op_proto[OUTPUTS][output_][DUPLICABLE] != True: + if not op_proto[OUTPUTS][output_][DUPLICABLE]: outputs = outputs + "PD_Tensor:${},".format(output_) else: outputs = outputs + "PD_Tensor_Array:${},".format( diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index 24af09893e355..5afa47dc4fa92 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -376,7 +376,7 @@ def sampcd_extract_to_file(srccom, name, htype="def", hname=""): # None - no sample code found; # False - it needs other special equipment or environment. # so, the following conditional statements are intentionally arranged. - if matched == True: + if matched: tfname = os.path.join( SAMPLECODE_TEMPDIR, '{}_example{}'.format( @@ -395,7 +395,7 @@ def sampcd_extract_to_file(srccom, name, htype="def", hname=""): ) ) SUMMARY_INFO['skiptest'].append("{}-{}".format(name, cb['id'])) - elif matched == False: + elif not matched: logger.info( '{}\' code block (name:{}, id:{}) required({}) not match capacity({}).'.format( name, From 957fbb02445d1bebe26b3dd5c08064e603152947 Mon Sep 17 00:00:00 2001 From: shentanyue <34421038+shentanyue@users.noreply.github.com> Date: Tue, 1 Nov 2022 18:30:25 +0800 Subject: [PATCH 63/91] fix: only enable Lite XPU multi-stream when cfg.enable_multi_stream is set (#47537) --- paddle/fluid/inference/lite/engine.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 3a60077e9fa0b..c48b718d8878e 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -65,7 +65,9 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision, cfg.adaptive_seqlen); lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id); - lite_cxx_config.enable_xpu_multi_stream(cfg.enable_multi_stream); + if (cfg.enable_multi_stream) { + lite_cxx_config.enable_xpu_multi_stream(); + } #endif #ifdef LITE_SUBGRAPH_WITH_NPU From 399047d7f113318d7f2a16dff3b560bdacb60c5f Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 1 Nov 2022 18:41:05 +0800 Subject: [PATCH 64/91] [PHI]Standardise some C++ API (Part2) (#47510) * standard_api * add hardtanh --- .../new_executor/standalone_executor_test.cc | 4 +- .../operators/detection/yolov3_loss_op.cc | 4 +- .../operators/hierarchical_sigmoid_op.cc | 2 +- paddle/fluid/operators/tril_triu_op.cc | 2 +- paddle/fluid/operators/where_index_op.cc | 2 +- paddle/phi/api/yaml/legacy_backward.yaml | 46 +- paddle/phi/api/yaml/legacy_ops.yaml | 92 +-- paddle/phi/infermeta/backward.cc | 36 +- paddle/phi/infermeta/backward.h | 36 +- paddle/phi/infermeta/multiary.cc | 60 +- paddle/phi/infermeta/multiary.h | 60 +- paddle/phi/infermeta/ternary.cc | 116 ++-- paddle/phi/infermeta/ternary.h | 16 +- paddle/phi/infermeta/unary.cc | 554 +++++++++--------- paddle/phi/infermeta/unary.h | 58 +- .../cpu/hierarchical_sigmoid_grad_kernel.cc | 71 --- ...al_sigmoid_grad.h => hsigmoid_loss_grad.h} | 41 +- .../kernels/cpu/hsigmoid_loss_grad_kernel.cc | 71 +++
...moid_kernel.cc => hsigmoid_loss_kernel.cc} | 44 +- ...here_index_kernel.cc => nonzero_kernel.cc} | 12 +- ...rod_grad_kernel.cc => prod_grad_kernel.cc} | 6 +- .../{reduce_prod_kernel.cc => prod_kernel.cc} | 2 +- ...ril_triu_kernel.cc => tril_grad_kernel.cc} | 6 +- ...ril_triu_grad_kernel.cc => tril_kernel.cc} | 6 +- ...rnel.cc => uniform_inplace_grad_kernel.cc} | 24 +- ...ce_kernel.cc => uniform_inplace_kernel.cc} | 24 +- ...orm_random_kernel.cc => uniform_kernel.cc} | 26 +- ...ov3_loss_functor.h => yolo_loss_functor.h} | 0 ...rad_kernel.cc => yolo_loss_grad_kernel.cc} | 50 +- ...ov3_loss_kernel.cc => yolo_loss_kernel.cc} | 36 +- paddle/phi/kernels/gpu/lstsq_kernel.cu | 6 +- ...here_index_kernel.cu => nonzero_kernel.cu} | 12 +- ...rod_grad_kernel.cu => prod_grad_kernel.cu} | 6 +- paddle/phi/kernels/gpu/qr_kernel.cu | 6 +- ...ril_triu_kernel.cu => tril_grad_kernel.cu} | 6 +- ...ril_triu_grad_kernel.cu => tril_kernel.cu} | 6 +- ...rnel.cu => uniform_inplace_grad_kernel.cu} | 24 +- ...ce_kernel.cu => uniform_inplace_kernel.cu} | 24 +- ...orm_random_kernel.cu => uniform_kernel.cu} | 26 +- .../hierarchical_sigmoid_grad_kernel.h | 42 -- .../phi/kernels/hierarchical_sigmoid_kernel.h | 40 -- .../phi/kernels/hsigmoid_loss_grad_kernel.h | 42 ++ paddle/phi/kernels/hsigmoid_loss_kernel.h | 40 ++ ..._kernel_impl.h => prod_grad_kernel_impl.h} | 18 +- paddle/phi/kernels/impl/qr_grad_kernel_impl.h | 6 +- ..._kernel_impl.h => tril_grad_kernel_impl.h} | 12 +- ..._triu_kernel_impl.h => tril_kernel_impl.h} | 12 +- .../{reduce_prod_kernel.cu => prod_kernel.cu} | 2 +- ...{where_index_kernel.h => nonzero_kernel.h} | 6 +- ..._prod_grad_kernel.h => prod_grad_kernel.h} | 16 +- .../{reduce_prod_kernel.cc => prod_kernel.cc} | 2 +- .../{reduce_prod_kernel.h => prod_kernel.h} | 0 .../hierarchical_sigmoid_grad_kernel.cc | 99 ---- .../hierarchical_sigmoid_grad_kernel.h | 45 -- .../hsigmoid_loss_grad_kernel.cc | 99 ++++ .../selected_rows/hsigmoid_loss_grad_kernel.h | 45 ++ .../kernels/selected_rows/uniform_kernel.cc | 96 +++ ...iform_random_kernel.h => uniform_kernel.h} | 34 +- .../selected_rows/uniform_random_kernel.cc | 106 ---- ..._triu_grad_kernel.h => tril_grad_kernel.h} | 10 +- .../{tril_triu_kernel.h => tril_kernel.h} | 22 +- ...kernel.h => uniform_inplace_grad_kernel.h} | 18 +- ...grad_kernel.h => uniform_inplace_kernel.h} | 18 +- ...orm_random_kernel.cc => uniform_kernel.cc} | 30 +- ...iform_random_kernel.h => uniform_kernel.h} | 34 +- ...here_index_kernel.cc => nonzero_kernel.cc} | 10 +- .../{reduce_prod_kernel.cc => prod_kernel.cc} | 2 +- ...riu_grad_kernel.cc => tril_grad_kernel.cc} | 14 +- .../{tril_triu_kernel.cc => tril_kernel.cc} | 15 +- ...orm_random_kernel.cc => uniform_kernel.cc} | 26 +- paddle/phi/kernels/yolo_loss_grad_kernel.h | 42 ++ paddle/phi/kernels/yolo_loss_kernel.h | 38 ++ paddle/phi/kernels/yolov3_loss_grad_kernel.h | 42 -- paddle/phi/kernels/yolov3_loss_kernel.h | 38 -- .../ops/compat/hierarchical_sigmoid_sig.cc | 9 +- paddle/phi/ops/compat/tril_triu_sig.cc | 7 +- .../ops/compat/uniform_random_inplace_sig.cc | 6 +- paddle/phi/ops/compat/uniform_random_sig.cc | 30 +- paddle/phi/ops/compat/where_index_sig.cc | 27 + paddle/phi/ops/compat/yolov3_loss_sig.cc | 7 +- python/paddle/fluid/initializer.py | 6 +- python/paddle/fluid/layers/nn.py | 6 +- python/paddle/nn/functional/loss.py | 2 +- python/paddle/tensor/creation.py | 4 +- python/paddle/tensor/math.py | 2 +- python/paddle/tensor/random.py | 4 +- python/paddle/tensor/search.py | 2 +- python/paddle/vision/ops.py | 2 +- 88 files changed, 1437 
insertions(+), 1421 deletions(-) delete mode 100644 paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc rename paddle/phi/kernels/cpu/{hierarchical_sigmoid_grad.h => hsigmoid_loss_grad.h} (71%) create mode 100644 paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc rename paddle/phi/kernels/cpu/{hierarchical_sigmoid_kernel.cc => hsigmoid_loss_kernel.cc} (72%) rename paddle/phi/kernels/cpu/{where_index_kernel.cc => nonzero_kernel.cc} (90%) rename paddle/phi/kernels/cpu/{reduce_prod_grad_kernel.cc => prod_grad_kernel.cc} (84%) rename paddle/phi/kernels/cpu/{reduce_prod_kernel.cc => prod_kernel.cc} (96%) rename paddle/phi/kernels/cpu/{tril_triu_kernel.cc => tril_grad_kernel.cc} (88%) rename paddle/phi/kernels/cpu/{tril_triu_grad_kernel.cc => tril_kernel.cc} (86%) rename paddle/phi/kernels/cpu/{uniform_random_inplace_grad_kernel.cc => uniform_inplace_grad_kernel.cc} (59%) rename paddle/phi/kernels/cpu/{uniform_random_inplace_kernel.cc => uniform_inplace_kernel.cc} (68%) rename paddle/phi/kernels/cpu/{uniform_random_kernel.cc => uniform_kernel.cc} (76%) rename paddle/phi/kernels/cpu/{yolov3_loss_functor.h => yolo_loss_functor.h} (100%) rename paddle/phi/kernels/cpu/{yolov3_loss_grad_kernel.cc => yolo_loss_grad_kernel.cc} (85%) rename paddle/phi/kernels/cpu/{yolov3_loss_kernel.cc => yolo_loss_kernel.cc} (93%) rename paddle/phi/kernels/gpu/{where_index_kernel.cu => nonzero_kernel.cu} (90%) rename paddle/phi/kernels/gpu/{reduce_prod_grad_kernel.cu => prod_grad_kernel.cu} (84%) rename paddle/phi/kernels/gpu/{tril_triu_kernel.cu => tril_grad_kernel.cu} (88%) rename paddle/phi/kernels/gpu/{tril_triu_grad_kernel.cu => tril_kernel.cu} (86%) rename paddle/phi/kernels/gpu/{uniform_random_inplace_grad_kernel.cu => uniform_inplace_grad_kernel.cu} (61%) rename paddle/phi/kernels/gpu/{uniform_random_inplace_kernel.cu => uniform_inplace_kernel.cu} (79%) rename paddle/phi/kernels/gpu/{uniform_random_kernel.cu => uniform_kernel.cu} (81%) delete mode 100644 paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h delete mode 100644 paddle/phi/kernels/hierarchical_sigmoid_kernel.h create mode 100644 paddle/phi/kernels/hsigmoid_loss_grad_kernel.h create mode 100644 paddle/phi/kernels/hsigmoid_loss_kernel.h rename paddle/phi/kernels/impl/{reduce_prod_grad_kernel_impl.h => prod_grad_kernel_impl.h} (69%) rename paddle/phi/kernels/impl/{tril_triu_grad_kernel_impl.h => tril_grad_kernel_impl.h} (82%) rename paddle/phi/kernels/impl/{tril_triu_kernel_impl.h => tril_kernel_impl.h} (83%) rename paddle/phi/kernels/kps/{reduce_prod_kernel.cu => prod_kernel.cu} (96%) rename paddle/phi/kernels/{where_index_kernel.h => nonzero_kernel.h} (84%) rename paddle/phi/kernels/{reduce_prod_grad_kernel.h => prod_grad_kernel.h} (68%) rename paddle/phi/kernels/{reduce_prod_kernel.cc => prod_kernel.cc} (96%) rename paddle/phi/kernels/{reduce_prod_kernel.h => prod_kernel.h} (100%) delete mode 100644 paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc delete mode 100644 paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc create mode 100644 paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/uniform_kernel.cc rename paddle/phi/kernels/selected_rows/{uniform_random_kernel.h => uniform_kernel.h} (54%) delete mode 100644 paddle/phi/kernels/selected_rows/uniform_random_kernel.cc rename paddle/phi/kernels/{tril_triu_grad_kernel.h => tril_grad_kernel.h} (77%) rename 
paddle/phi/kernels/{tril_triu_kernel.h => tril_kernel.h} (66%) rename paddle/phi/kernels/{uniform_random_inplace_kernel.h => uniform_inplace_grad_kernel.h} (63%) rename paddle/phi/kernels/{uniform_random_inplace_grad_kernel.h => uniform_inplace_kernel.h} (60%) rename paddle/phi/kernels/{uniform_random_kernel.cc => uniform_kernel.cc} (68%) rename paddle/phi/kernels/{uniform_random_kernel.h => uniform_kernel.h} (54%) rename paddle/phi/kernels/xpu/{where_index_kernel.cc => nonzero_kernel.cc} (89%) rename paddle/phi/kernels/xpu/{reduce_prod_kernel.cc => prod_kernel.cc} (96%) rename paddle/phi/kernels/xpu/{tril_triu_grad_kernel.cc => tril_grad_kernel.cc} (81%) rename paddle/phi/kernels/xpu/{tril_triu_kernel.cc => tril_kernel.cc} (82%) rename paddle/phi/kernels/xpu/{uniform_random_kernel.cc => uniform_kernel.cc} (77%) create mode 100644 paddle/phi/kernels/yolo_loss_grad_kernel.h create mode 100644 paddle/phi/kernels/yolo_loss_kernel.h delete mode 100644 paddle/phi/kernels/yolov3_loss_grad_kernel.h delete mode 100644 paddle/phi/kernels/yolov3_loss_kernel.h create mode 100644 paddle/phi/ops/compat/where_index_sig.cc diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index f625f133ed3f5..207ca7cf0828f 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -63,8 +63,8 @@ USE_OP_ITSELF(memcpy_d2h); USE_OP_ITSELF(fetch_v2); PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(uniform_random_raw, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(uniform_random, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(uniform_raw, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(uniform, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(transpose, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(reshape, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(split, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 5f6ffece3bf54..0b8fc79826f1c 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -218,10 +218,10 @@ class Yolov3LossGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(yolov3_loss, Yolov3LossInferShapeFunctor, - PD_INFER_META(phi::Yolov3LossInferMeta)); + PD_INFER_META(phi::YoloLossInferMeta)); DECLARE_INFER_SHAPE_FUNCTOR(yolov3_loss_grad, Yolov3LossGradInferShapeFunctor, - PD_INFER_META(phi::Yolov3LossGradInferMeta)); + PD_INFER_META(phi::YoloLossGradInferMeta)); REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker, diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 338b8af503673..8193be6b6b8e1 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -259,7 +259,7 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(hierarchical_sigmoid, HierarchicalSigmoidInferShapeFunctor, - PD_INFER_META(phi::HierarchicalSigmoidInferMeta)); + PD_INFER_META(phi::HSigmoidLossInferMeta)); REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp, ops::HierarchicalSigmoidOpMaker, diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index 5d2c3c0797acf..97c9289295022 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ 
b/paddle/fluid/operators/tril_triu_op.cc @@ -93,7 +93,7 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; DECLARE_INFER_SHAPE_FUNCTOR(tril_triu, TrilTriuInferShapeFunctor, - PD_INFER_META(phi::TrilTriuInferMeta)); + PD_INFER_META(phi::TrilInferMeta)); REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, diff --git a/paddle/fluid/operators/where_index_op.cc b/paddle/fluid/operators/where_index_op.cc index 7e5cc8fa53a51..52448b08c5e11 100644 --- a/paddle/fluid/operators/where_index_op.cc +++ b/paddle/fluid/operators/where_index_op.cc @@ -48,7 +48,7 @@ class WhereIndexOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(where_index, WhereIndexInferShapeFunctor, - PD_INFER_META(phi::WhereIndexInferMeta)); + PD_INFER_META(phi::NonZeroInferMeta)); REGISTER_OPERATOR( where_index, ops::WhereIndexOp, diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 922cb70d6e7e1..4e9a4abfcdb65 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -791,8 +791,8 @@ func : hard_tanh_grad inplace : (out_grad -> x_grad) -- backward_op : hierarchical_sigmoid_grad - forward : hierarchical_sigmoid (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) -> Tensor(out), Tensor(pre_out), Tensor(w_out) +- backward_op : hsigmoid_loss_grad + forward : hsigmoid_loss (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) -> Tensor(out), Tensor(pre_out), Tensor(w_out) args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, Tensor pre_out, Tensor out_grad, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) output : Tensor(x_grad), Tensor(w_grad), Tensor(bias_grad) infer_meta : @@ -800,7 +800,7 @@ param : [x ,w, bias] optional: path, code, bias kernel : - func : hierarchical_sigmoid_grad + func : hsigmoid_loss_grad - backward_op : huber_loss_grad forward : huber_loss (Tensor input, Tensor label, float delta) -> Tensor(out), Tensor(residual) @@ -1477,6 +1477,16 @@ kernel : func : prelu_grad +- backward_op : prod_grad + forward : prod (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad, IntArray dims, bool keep_dim, bool reduce_all) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : prod_grad + - backward_op : psroi_pool_grad forward : psroi_pool (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, int output_channels, float spatial_scale) -> Tensor(out) args : (Tensor x, Tensor boxes, Tensor boxes_num, Tensor out_grad, int pooled_height, int pooled_width, int output_channels, float spatial_scale) @@ -1516,16 +1526,6 @@ output : Tensor(x_grad) invoke : real_grad_impl(out_grad, x_grad) -- backward_op : reduce_prod_grad - forward : reduce_prod (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad, IntArray dims, bool keep_dim, bool reduce_all) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - 
kernel : - func : prod_grad - - backward_op : relu6_grad forward : relu6 (Tensor x, float threshold) -> Tensor(out) args : (Tensor out, Tensor out_grad, float threshold) @@ -2234,15 +2234,15 @@ kernel : func : triangular_solve_grad -- backward_op : tril_triu_grad - forward : tril_triu(Tensor x, int diagonal, bool lower) -> Tensor(out) +- backward_op : tril_grad + forward : tril(Tensor x, int diagonal, bool lower) -> Tensor(out) args : (Tensor out_grad, int diagonal, bool lower) output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta param : [out_grad] kernel : - func : tril_triu_grad + func : tril_grad - backward_op : trilinear_interp_grad forward : trilinear_interp (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) -> Tensor(output) @@ -2273,14 +2273,14 @@ func : unfold_grad no_need_buffer : x -- backward_op : uniform_random_inplace_grad - forward : uniform_random_inplace(Tensor x, float min, float max, int seed, int diag_num, int diag_step, float diag_val) -> Tensor(out) +- backward_op : uniform_inplace_grad + forward : uniform_inplace(Tensor x, float min, float max, int seed, int diag_num, int diag_step, float diag_val) -> Tensor(out) args : (Tensor out_grad, float min, float max, int seed, int diag_num, int diag_step, float diag_val) output : Tensor(x_grad) infer_meta : func : UniformRandomInplaceGradInferMeta kernel : - func : uniform_random_inplace_grad + func : uniform_inplace_grad inplace : (out_grad -> x_grad) - backward_op : unsqueeze_double_grad @@ -2335,14 +2335,14 @@ func : where_grad no_need_buffer : x, y -- backward_op : yolov3_loss_grad - forward : yolov3_loss(Tensor x, Tensor gt_box, Tensor gt_label, Tensor gt_score, int[] anchors, int[] anchor_mask, int class_num, float ignore_thresh, int downsample_ratio, bool use_label_smooth=true, float scale_x_y=1.0) -> Tensor(loss), Tensor(objectness_mask), Tensor(gt_match_mask) +- backward_op : yolo_loss_grad + forward : yolo_loss(Tensor x, Tensor gt_box, Tensor gt_label, Tensor gt_score, int[] anchors, int[] anchor_mask, int class_num, float ignore_thresh, int downsample_ratio, bool use_label_smooth=true, float scale_x_y=1.0) -> Tensor(loss), Tensor(objectness_mask), Tensor(gt_match_mask) args : (Tensor x, Tensor gt_box, Tensor gt_label, Tensor gt_score, Tensor objectness_mask, Tensor gt_match_mask, Tensor loss_grad, int[] anchors, int[] anchor_mask, int class_num, float ignore_thresh, int downsample_ratio, bool use_label_smooth=true, float scale_x_y=1.0) output : Tensor(x_grad), Tensor(gt_box_grad), Tensor(gt_label_grad), Tensor(gt_score_grad) infer_meta : - func : Yolov3LossGradInferMeta + func : YoloLossGradInferMeta kernel : - func : yolov3_loss_grad + func : yolo_loss_grad optional : gt_score - backward_op: fold_grad diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index c42bc74461e5a..03dbb08b59fb1 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1036,17 +1036,6 @@ func : hard_tanh backward : hardtanh_grad -- op : hierarchical_sigmoid - args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) - output : Tensor(out), Tensor(pre_out), Tensor(w_out) - infer_meta : - func : HierarchicalSigmoidInferMeta - optional: path, code, bias - kernel : - func : 
hierarchical_sigmoid - data_type : x - backward : hierarchical_sigmoid_grad - - op : histogram args : (Tensor input, int64_t bins, int min, int max) output : Tensor(out) @@ -1055,6 +1044,17 @@ kernel : func : histogram +- op : hsigmoid_loss + args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) + output : Tensor(out), Tensor(pre_out), Tensor(w_out) + infer_meta : + func : HSigmoidLossInferMeta + optional: path, code, bias + kernel : + func : hsigmoid_loss + data_type : x + backward : hsigmoid_loss_grad + - op : huber_loss args : (Tensor input, Tensor label, float delta) output : Tensor(out), Tensor(residual) @@ -1696,6 +1696,14 @@ func : nms data_type : x +- op : nonzero + args : (Tensor condition) + output : Tensor(out) + infer_meta : + func : NonZeroInferMeta + kernel : + func : nonzero + - op : norm args : (Tensor x, int axis, float epsilon, bool is_test) output : Tensor(out), Tensor(norm) @@ -1828,6 +1836,15 @@ kernel : func : prior_box +- op : prod + args : (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) + output : Tensor + infer_meta : + func : ReduceIntArrayAxisInferMetaBase + kernel : + func : prod_raw + backward : prod_grad + - op : psroi_pool args : (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, int output_channels, float spatial_scale) output : Tensor @@ -1893,15 +1910,6 @@ func : real backward : real_grad -- op : reduce_prod - args : (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) - output : Tensor - infer_meta : - func : ReduceIntArrayAxisInferMetaBase - kernel : - func : prod_raw - backward : reduce_prod_grad - - op : relu args : (Tensor x) output : Tensor(out) @@ -2460,6 +2468,15 @@ func : triangular_solve backward : triangular_solve_grad +- op : tril + args : (Tensor x, int diagonal, bool lower) + output : Tensor(out) + infer_meta : + func : TrilInferMeta + kernel : + func : tril + backward : tril_grad + - op : tril_indices args : (int rows, int cols, int offset, DataType dtype, Place place={}) output : Tensor(out) @@ -2472,15 +2489,6 @@ data_type : dtype backend : place -- op : tril_triu - args : (Tensor x, int diagonal, bool lower) - output : Tensor(out) - infer_meta : - func : TrilTriuInferMeta - kernel : - func : tril_triu - backward : tril_triu_grad - - op : trilinear_interp args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) output : Tensor(output) @@ -2535,14 +2543,14 @@ func : unfold backward : unfold_grad -- op : uniform_random +- op : uniform args : (IntArray shape, DataType dtype, Scalar min, Scalar max, int seed, Place place={}) output : Tensor(out) infer_meta : func : UniformRandomInferMeta param: [shape, dtype] kernel : - func : uniform_random + func : uniform param: [shape, dtype, min, max, seed] data_type : dtype backend : place @@ -2628,14 +2636,6 @@ func : where backward : where_grad -- op : where_index - args : (Tensor condition) - output : Tensor(out) - infer_meta : - func : WhereIndexInferMeta - kernel : - func : where_index - - op : yolo_box args : (Tensor x, Tensor img_size, int[] anchors, int class_num, float conf_thresh, int downsample_ratio, bool clip_bbox, float scale_x_y=1.0, bool iou_aware=false, float iou_aware_factor=0.5) output : Tensor(boxes), Tensor(scores) @@ -2645,16 +2645,16 @@ func : 
yolo_box data_type : x -- op : yolov3_loss +- op : yolo_loss args : (Tensor x, Tensor gt_box, Tensor gt_label, Tensor gt_score, int[] anchors, int[] anchor_mask, int class_num, float ignore_thresh, int downsample_ratio, bool use_label_smooth=true, float scale_x_y=1.0) output : Tensor(loss), Tensor(objectness_mask), Tensor(gt_match_mask) infer_meta : - func : Yolov3LossInferMeta + func : YoloLossInferMeta kernel : - func : yolov3_loss + func : yolo_loss data_type : x optional : gt_score - backward : yolov3_loss_grad + backward : yolo_loss_grad - op : zeros args : (IntArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) @@ -2734,16 +2734,16 @@ intermediate : reserve view : (dropout_state_in -> dropout_state_out) -- op: uniform_random_inplace +- op: uniform_inplace args: (Tensor x, float min, float max, int seed, int diag_num, int diag_step, float diag_val) output: Tensor(out) infer_meta: func: UniformRandomInplaceInferMeta kernel: - func: uniform_random_inplace + func: uniform_inplace data_type: x inplace: (x -> out) - backward: uniform_random_inplace_grad + backward: uniform_inplace_grad - op: unpool args: (Tensor x, Tensor indices, int[] ksize, int[] strides, int[] padding, IntArray output_size, str data_format) diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 6f8a60c9232fa..ef2f384cb6ac7 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -987,24 +987,24 @@ void UnStackGradInferMeta(const std::vector& out_grad, x_grad->set_dtype(out_grad[0]->dtype()); } -void Yolov3LossGradInferMeta(const MetaTensor& x, - const MetaTensor& gt_box, - const MetaTensor& gt_label, - const MetaTensor& gt_score, - const MetaTensor& objectness_mask, - const MetaTensor& gt_match_mask, - const MetaTensor& loss_grad, - const std::vector& anchors, - const std::vector& anchor_mask, - int class_num, - float ignore_thresh, - int downsample_ratio, - bool use_label_smooth, - float scale_x_y, - MetaTensor* x_grad, - MetaTensor* gt_box_grad, - MetaTensor* gt_label_grad, - MetaTensor* gt_score_grad) { +void YoloLossGradInferMeta(const MetaTensor& x, + const MetaTensor& gt_box, + const MetaTensor& gt_label, + const MetaTensor& gt_score, + const MetaTensor& objectness_mask, + const MetaTensor& gt_match_mask, + const MetaTensor& loss_grad, + const std::vector& anchors, + const std::vector& anchor_mask, + int class_num, + float ignore_thresh, + int downsample_ratio, + bool use_label_smooth, + float scale_x_y, + MetaTensor* x_grad, + MetaTensor* gt_box_grad, + MetaTensor* gt_label_grad, + MetaTensor* gt_score_grad) { if (x_grad) { x_grad->set_dims(x.dims()); x_grad->set_dtype(x.dtype()); diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index dd86055978a99..01cdc8023a148 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -385,24 +385,24 @@ void UnStackGradInferMeta(const std::vector& out_grad, int axis, MetaTensor* x_grad); -void Yolov3LossGradInferMeta(const MetaTensor& x, - const MetaTensor& gt_box, - const MetaTensor& gt_label, - const MetaTensor& gt_score, - const MetaTensor& objectness_mask, - const MetaTensor& gt_match_mask, - const MetaTensor& loss_grad, - const std::vector& anchors, - const std::vector& anchor_mask, - int class_num, - float ignore_thresh, - int downsample_ratio, - bool use_label_smooth, - float scale_x_y, - MetaTensor* x_grad, - MetaTensor* gt_box_grad, - MetaTensor* gt_label_grad, - MetaTensor* gt_score_grad); +void YoloLossGradInferMeta(const 
MetaTensor& x, + const MetaTensor& gt_box, + const MetaTensor& gt_label, + const MetaTensor& gt_score, + const MetaTensor& objectness_mask, + const MetaTensor& gt_match_mask, + const MetaTensor& loss_grad, + const std::vector& anchors, + const std::vector& anchor_mask, + int class_num, + float ignore_thresh, + int downsample_ratio, + bool use_label_smooth, + float scale_x_y, + MetaTensor* x_grad, + MetaTensor* gt_box_grad, + MetaTensor* gt_label_grad, + MetaTensor* gt_score_grad); void IndexAddGradInferMeta(const MetaTensor& index, const MetaTensor& add_value, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 91d2642139a2c..52050f160e24e 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1328,22 +1328,22 @@ void GraphSampleNeighborsInferMeta(const MetaTensor& row, out_count->set_dtype(DataType::INT32); } -void HierarchicalSigmoidInferMeta(const MetaTensor& x, - const MetaTensor& w, - const MetaTensor& label, - const MetaTensor& path, - const MetaTensor& code, - const MetaTensor& bias, - int num_classes, - bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, - bool is_sparse, - MetaTensor* out, - MetaTensor* pre_out, - MetaTensor* w_out) { +void HSigmoidLossInferMeta(const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& label, + const MetaTensor& path, + const MetaTensor& code, + const MetaTensor& bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + MetaTensor* out, + MetaTensor* pre_out, + MetaTensor* w_out) { const int64_t input_dims = x.dims()[0]; const int64_t label_dims = label.dims()[0]; PADDLE_ENFORCE_EQ(input_dims, @@ -2762,20 +2762,20 @@ void WhereInferMeta(const MetaTensor& condition, out->share_meta(x); } -void Yolov3LossInferMeta(const MetaTensor& x, - const MetaTensor& gt_box, - const MetaTensor& gt_label, - const MetaTensor& gt_score, - const std::vector& anchors, - const std::vector& anchor_mask, - int class_num, - float ignore_thresh, - int downsample_ratio, - bool use_label_smooth, - float scale_x_y, - MetaTensor* loss, - MetaTensor* objectness_mask, - MetaTensor* gt_match_mask) { +void YoloLossInferMeta(const MetaTensor& x, + const MetaTensor& gt_box, + const MetaTensor& gt_label, + const MetaTensor& gt_score, + const std::vector& anchors, + const std::vector& anchor_mask, + int class_num, + float ignore_thresh, + int downsample_ratio, + bool use_label_smooth, + float scale_x_y, + MetaTensor* loss, + MetaTensor* objectness_mask, + MetaTensor* gt_match_mask) { auto dim_x = x.dims(); auto dim_gtbox = gt_box.dims(); auto dim_gtlabel = gt_label.dims(); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 0dfb1307c02a8..79926e06b2b2e 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -288,22 +288,22 @@ void GraphSampleNeighborsInferMeta(const MetaTensor& row, MetaTensor* out_count, MetaTensor* out_eids); -void HierarchicalSigmoidInferMeta(const MetaTensor& x, - const MetaTensor& w, - const MetaTensor& label, - const MetaTensor& path, - const MetaTensor& code, - const MetaTensor& bias, - int num_classes, - bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, - bool is_sparse, - MetaTensor* out, - MetaTensor* pre_out, - 
MetaTensor* w_out); +void HSigmoidLossInferMeta(const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& label, + const MetaTensor& path, + const MetaTensor& code, + const MetaTensor& bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + MetaTensor* out, + MetaTensor* pre_out, + MetaTensor* w_out); void InterpolateInferMeta( const MetaTensor& x, @@ -508,19 +508,19 @@ void WhereInferMeta(const MetaTensor& condition, const MetaTensor& y, MetaTensor* out); -void Yolov3LossInferMeta(const MetaTensor& x, - const MetaTensor& gt_box, - const MetaTensor& gt_label, - const MetaTensor& gt_score, - const std::vector& anchors, - const std::vector& anchor_mask, - int class_num, - float ignore_thresh, - int downsample_ratio, - bool use_label_smooth, - float scale_x_y, - MetaTensor* loss, - MetaTensor* objectness_mask, - MetaTensor* gt_match_mask); +void YoloLossInferMeta(const MetaTensor& x, + const MetaTensor& gt_box, + const MetaTensor& gt_label, + const MetaTensor& gt_score, + const std::vector& anchors, + const std::vector& anchor_mask, + int class_num, + float ignore_thresh, + int downsample_ratio, + bool use_label_smooth, + float scale_x_y, + MetaTensor* loss, + MetaTensor* objectness_mask, + MetaTensor* gt_match_mask); } // namespace phi diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 874432aedd573..c8310707351a7 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -402,64 +402,6 @@ void InstanceNormInferMeta(const MetaTensor& x, } } -void SendURecvInferMeta(const MetaTensor& x, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& reduce_op, - const IntArray& out_size, - MetaTensor* out, - MetaTensor* dst_count) { - auto src_index_dims = src_index.dims(); - if (src_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(src_index_dims[1], - 1, - phi::errors::InvalidArgument( - "The last dim of Src_index should be 1 when it " - "is 2D, but we get %d", - src_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - src_index_dims.size(), - 1, - phi::errors::InvalidArgument( - "The Src_index should be 1D, when it is not 2D, but we get %d", - src_index_dims.size())); - } - - auto dst_index_dims = dst_index.dims(); - if (dst_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(dst_index_dims[1], - 1, - phi::errors::InvalidArgument( - "The last dim of Dst_index should be 1 when it " - "is 2D, but we get %d", - dst_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dst_index_dims.size(), - 1, - phi::errors::InvalidArgument("The Dst_index should be 1D, " - "when it is not 2D, but we get %d", - dst_index_dims.size())); - } - - PADDLE_ENFORCE_EQ(src_index_dims[0], - dst_index_dims[0], - phi::errors::InvalidArgument( - "Src_index and Dst_index should have the same shape.")); - - auto dims = x.dims(); - std::vector dims_ = phi::vectorize(dims); - dims_[0] = -1; - out->set_dims(phi::make_ddim(dims_)); - out->set_dtype(x.dtype()); - - if (reduce_op == "MEAN") { - dst_count->set_dims({-1}); - dst_count->set_dtype(DataType::INT32); - } -} - void GroupNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, @@ -1164,6 +1106,64 @@ void ScatterNdAddInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void SendURecvInferMeta(const MetaTensor& x, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& reduce_op, + const IntArray& out_size, + 
MetaTensor* out, + MetaTensor* dst_count) { + auto src_index_dims = src_index.dims(); + if (src_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(src_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Src_index should be 1 when it " + "is 2D, but we get %d", + src_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + src_index_dims.size(), + 1, + phi::errors::InvalidArgument( + "The Src_index should be 1D, when it is not 2D, but we get %d", + src_index_dims.size())); + } + + auto dst_index_dims = dst_index.dims(); + if (dst_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(dst_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Dst_index should be 1 when it " + "is 2D, but we get %d", + dst_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + dst_index_dims.size(), + 1, + phi::errors::InvalidArgument("The Dst_index should be 1D, " + "when it is not 2D, but we get %d", + dst_index_dims.size())); + } + + PADDLE_ENFORCE_EQ(src_index_dims[0], + dst_index_dims[0], + phi::errors::InvalidArgument( + "Src_index and Dst_index should have the same shape.")); + + auto dims = x.dims(); + std::vector dims_ = phi::vectorize(dims); + dims_[0] = -1; + out->set_dims(phi::make_ddim(dims_)); + out->set_dtype(x.dtype()); + + if (reduce_op == "MEAN") { + dst_count->set_dims({-1}); + dst_count->set_dtype(DataType::INT32); + } +} + void SpectralNormInferMeta(const MetaTensor& weight, const MetaTensor& u, const MetaTensor& v, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index e0b1573e16679..1d0e7e8744dc1 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -72,14 +72,6 @@ void InstanceNormInferMeta(const MetaTensor& x, MetaTensor* saved_variance, MetaConfig config = MetaConfig()); -void SendURecvInferMeta(const MetaTensor& x, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& reduce_op, - const IntArray& out_size, - MetaTensor* out, - MetaTensor* dst_count); - void GroupNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, @@ -186,6 +178,14 @@ void ScatterNdAddInferMeta(const MetaTensor& x, const MetaTensor& updates, MetaTensor* out); +void SendURecvInferMeta(const MetaTensor& x, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& reduce_op, + const IntArray& out_size, + MetaTensor* out, + MetaTensor* dst_count); + void SpectralNormInferMeta(const MetaTensor& weight, const MetaTensor& u, const MetaTensor& v, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index d83477de96181..bfc769032d404 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1204,6 +1204,211 @@ void FlipInferMeta(const MetaTensor& x, out->share_lod(x); } +void FoldInferMeta(const MetaTensor& x, + const std::vector& output_sizes, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + MetaTensor* out) { + auto in_dims = x.dims(); + + PADDLE_ENFORCE_EQ( + output_sizes.size(), + 2, + phi::errors::InvalidArgument( + "It is expected output_size equals to 2, but got size %d", + output_sizes.size())); + PADDLE_ENFORCE_EQ( + kernel_sizes.size(), + 2, + phi::errors::InvalidArgument( + "It is expected kernel_size equals to 2, but got size %d", + kernel_sizes.size())); + PADDLE_ENFORCE_EQ( + strides.size(), + 2, + phi::errors::InvalidArgument( + "It is expected strides_size equals to 2, but got size %d", + strides.size())); + PADDLE_ENFORCE_EQ( + 
paddings.size(), + 4, + phi::errors::InvalidArgument( + "It is expected paddings_size equals to 4, but got size %d", + paddings.size())); + + PADDLE_ENFORCE_EQ( + dilations.size(), + 2, + phi::errors::InvalidArgument( + "It is expected dilations_size equals to 2, but got size %d", + dilations.size())); + + int output_height = output_sizes[0]; + int output_width = output_sizes[1]; + int kernel_height = kernel_sizes[0]; + int kernel_width = kernel_sizes[1]; + int dilation_height = dilations[0]; + int dilation_width = dilations[1]; + int stride_height = strides[0]; + int stride_width = strides[1]; + + // check kernel_sizes + PADDLE_ENFORCE_GT(kernel_height, + 0, + phi::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but received kernel_height: %d kernel_width: %d.", + kernel_sizes[0], + kernel_sizes[1])); + PADDLE_ENFORCE_GT(kernel_width, + 0, + phi::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but received kernel_height: %d kernel_width: %d.", + kernel_sizes[0], + kernel_sizes[1])); + // check strides + PADDLE_ENFORCE_GT(stride_height, + 0, + phi::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but received strides_height: %d strides_width: %d.", + strides[0], + strides[1])); + PADDLE_ENFORCE_GT(stride_width, + 0, + phi::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but received strides_height: %d strides_width: %d.", + strides[0], + strides[1])); + // check dilations + PADDLE_ENFORCE_GT(output_height, + 1, + phi::errors::InvalidArgument( + "The `output_height` should be greater than one, " + "but received output_height: %d .", + output_height)); + PADDLE_ENFORCE_GT(output_width, + 1, + phi::errors::InvalidArgument( + "The `output_width` should be greater than one, " + "but received output_width: %d .", + output_width)); + // check output size + PADDLE_ENFORCE_GT( + dilation_height, + 0, + phi::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but received dilations_height: %d dilations_width: %d.", + dilations[0], + dilations[1])); + PADDLE_ENFORCE_GT( + dilation_width, + 0, + phi::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but received dilations_height: %d dilations_width: %d.", + dilations[0], + dilations[1])); + + std::vector out_dims; + // batch_size + out_dims.push_back(in_dims[0]); + // output_plane + int output_channels = in_dims[1] / (kernel_width * kernel_height); + out_dims.push_back(output_channels); + + int blocks_height = (output_sizes[0] + 2 * paddings[0] - + (dilations[0] * (kernel_sizes[0] - 1) + 1)) / + strides[0] + + 1; + int blocks_width = (output_sizes[1] + 2 * paddings[1] - + (dilations[1] * (kernel_sizes[1] - 1) + 1)) / + strides[1] + + 1; + + // check output height and width + PADDLE_ENFORCE_GT( + blocks_height, + 0, + phi::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size (%d, %d), " + "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " + "is (%d, %d), which should be a positive integer.", + in_dims[2], + in_dims[3], + kernel_sizes[0], + kernel_sizes[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + output_height, + output_width)); + + PADDLE_ENFORCE_GT( + blocks_width, + 0, + phi::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size (%d, %d), " + "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " + "is (%d, %d), which should be a positive integer.", + in_dims[2], + in_dims[3], + 
kernel_sizes[0], + kernel_sizes[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + output_height, + output_width)); + + PADDLE_ENFORCE_EQ( + blocks_height * blocks_width, + in_dims[2], + phi::errors::InvalidArgument( + "Given input output_size (%d, %d), " + "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " + "which should be expected size of input's dimension " + "2 to match the calculated number of %d * %d = %d, but got %d", + output_height, + output_width, + kernel_sizes[0], + kernel_sizes[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + blocks_height, + blocks_width, + blocks_height * blocks_width, + in_dims[2])); + + PADDLE_ENFORCE_EQ( + in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), + 0, + phi::errors::InvalidArgument( + "Expected size of input's dimension 1 to be divisible by the" + "product of kernel_size, but got input.size(1)=%d and " + "kernel_size=( %d" + ", %d).", + in_dims[1], + kernel_sizes[0], + kernel_sizes[1])); + + out_dims.push_back(output_height); + out_dims.push_back(output_width); + if (out != nullptr) { + out->set_dims(phi::make_ddim(out_dims)); + out->set_dtype(x.dtype()); + } +} + void FrameInferMeta(const MetaTensor& x, int frame_length, int hop_length, @@ -1327,6 +1532,18 @@ void HistogramInferMeta( out->share_lod(input); } +void IdentityLossInferMeta(const MetaTensor& x, + int reduction, + MetaTensor* out) { + if (reduction == 2) { + out->set_dtype(x.dtype()); + out->set_dims(x.dims()); + } else { + out->set_dims(phi::make_ddim({1})); + out->set_dtype(x.dtype()); + } +} + void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { PADDLE_ENFORCE_EQ( product(x.dims()), @@ -2036,6 +2253,17 @@ void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out) { out->set_dtype(DataType::INT64); } +void NonZeroInferMeta(const MetaTensor& condition, MetaTensor* out) { + auto rank = condition.dims().size(); + PADDLE_ENFORCE_GE( + rank, + 1UL, + phi::errors::InvalidArgument( + "Input(Condition) should have number of dimension at least 1")); + out->set_dims(phi::make_ddim({-1, rank})); + out->set_dtype(DataType::INT64); +} + void NormInferMeta(const MetaTensor& x, int axis, float epsilon, @@ -2054,16 +2282,53 @@ void NormInferMeta(const MetaTensor& x, } } -void OverlapAddInferMeta(const MetaTensor& x, - int hop_length, - int axis, - MetaTensor* out, - MetaConfig config) { - const auto x_dims = x.dims(); - const int x_rank = x_dims.size(); +void OneHotRawInferMeta(const MetaTensor& x, + const Scalar& depth, + DataType dtype, + bool allow_out_of_range, + MetaTensor* out) { + auto x_dims = x.dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 1, + phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); + auto out_dims_vec = phi::vectorize(x_dims); + out_dims_vec.push_back(depth.to()); + auto out_dims = phi::make_ddim(out_dims_vec); + out->set_dims(out_dims); + out->share_lod(x); + out->set_dtype(dtype); +} +void OneHotInferMeta(const MetaTensor& x, + const Scalar& depth_t, + MetaTensor* out) { + auto x_dims = x.dims(); PADDLE_ENFORCE_GE( - x_rank, + x_dims.size(), + 1, + phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); + + int depth = depth_t.to(); + auto out_dims_vec = phi::vectorize(x_dims); + out_dims_vec.push_back(depth); + auto out_dims = phi::make_ddim(out_dims_vec); + out->set_dims(out_dims); + out->share_lod(x); + + out->set_dtype(phi::DataType::FLOAT32); +} + +void OverlapAddInferMeta(const MetaTensor& x, + int hop_length, + int axis, + MetaTensor* out, + 
MetaConfig config) { + const auto x_dims = x.dims(); + const int x_rank = x_dims.size(); + + PADDLE_ENFORCE_GE( + x_rank, 2, errors::InvalidArgument( "Input(X) of OverlapAddOp should be a tensor which contains " @@ -3956,10 +4221,10 @@ void UnbindInferMeta(const MetaTensor& x, } } -void TrilTriuInferMeta(const MetaTensor& x, - int diagonal, - bool lower, - MetaTensor* out) { +void TrilInferMeta(const MetaTensor& x, + int diagonal, + bool lower, + MetaTensor* out) { const auto& x_dims = x.dims(); PADDLE_ENFORCE_GE(x_dims.size(), 2, @@ -4442,54 +4707,6 @@ void UnStackInferMeta(const MetaTensor& x, } } -void OneHotRawInferMeta(const MetaTensor& x, - const Scalar& depth, - DataType dtype, - bool allow_out_of_range, - MetaTensor* out) { - auto x_dims = x.dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), - 1, - phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); - auto out_dims_vec = phi::vectorize(x_dims); - out_dims_vec.push_back(depth.to()); - auto out_dims = phi::make_ddim(out_dims_vec); - out->set_dims(out_dims); - out->share_lod(x); - out->set_dtype(dtype); -} - -void OneHotInferMeta(const MetaTensor& x, - const Scalar& depth_t, - MetaTensor* out) { - auto x_dims = x.dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), - 1, - phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); - - int depth = depth_t.to(); - auto out_dims_vec = phi::vectorize(x_dims); - out_dims_vec.push_back(depth); - auto out_dims = phi::make_ddim(out_dims_vec); - out->set_dims(out_dims); - out->share_lod(x); - - out->set_dtype(phi::DataType::FLOAT32); -} - -void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out) { - auto rank = condition.dims().size(); - PADDLE_ENFORCE_GE( - rank, - 1UL, - phi::errors::InvalidArgument( - "Input(Condition) should have number of dimension at least 1")); - out->set_dims(phi::make_ddim({-1, rank})); - out->set_dtype(DataType::INT64); -} - void ChannelShuffleInferMeta(const MetaTensor& x, int groups, const std::string& data_format, @@ -4536,223 +4753,6 @@ void ChannelShuffleInferMeta(const MetaTensor& x, out->set_dims(output_dims); } -void IdentityLossInferMeta(const MetaTensor& x, - int reduction, - MetaTensor* out) { - if (reduction == 2) { - out->set_dtype(x.dtype()); - out->set_dims(x.dims()); - } else { - out->set_dims(phi::make_ddim({1})); - out->set_dtype(x.dtype()); - } -} - -void FoldInferMeta(const MetaTensor& x, - const std::vector& output_sizes, - const std::vector& kernel_sizes, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - MetaTensor* out) { - auto in_dims = x.dims(); - - PADDLE_ENFORCE_EQ( - output_sizes.size(), - 2, - phi::errors::InvalidArgument( - "It is expected output_size equals to 2, but got size %d", - output_sizes.size())); - PADDLE_ENFORCE_EQ( - kernel_sizes.size(), - 2, - phi::errors::InvalidArgument( - "It is expected kernel_size equals to 2, but got size %d", - kernel_sizes.size())); - PADDLE_ENFORCE_EQ( - strides.size(), - 2, - phi::errors::InvalidArgument( - "It is expected strides_size equals to 2, but got size %d", - strides.size())); - PADDLE_ENFORCE_EQ( - paddings.size(), - 4, - phi::errors::InvalidArgument( - "It is expected paddings_size equals to 4, but got size %d", - paddings.size())); - - PADDLE_ENFORCE_EQ( - dilations.size(), - 2, - phi::errors::InvalidArgument( - "It is expected dilations_size equals to 2, but got size %d", - dilations.size())); - - int output_height = output_sizes[0]; - int output_width = output_sizes[1]; - int kernel_height = 
kernel_sizes[0]; - int kernel_width = kernel_sizes[1]; - int dilation_height = dilations[0]; - int dilation_width = dilations[1]; - int stride_height = strides[0]; - int stride_width = strides[1]; - - // check kernel_sizes - PADDLE_ENFORCE_GT(kernel_height, - 0, - phi::errors::InvalidArgument( - "The `kernel_sizes` should be greater than zero, " - "but received kernel_height: %d kernel_width: %d.", - kernel_sizes[0], - kernel_sizes[1])); - PADDLE_ENFORCE_GT(kernel_width, - 0, - phi::errors::InvalidArgument( - "The `kernel_sizes` should be greater than zero, " - "but received kernel_height: %d kernel_width: %d.", - kernel_sizes[0], - kernel_sizes[1])); - // check strides - PADDLE_ENFORCE_GT(stride_height, - 0, - phi::errors::InvalidArgument( - "The `strides` should be greater than zero, " - "but received strides_height: %d strides_width: %d.", - strides[0], - strides[1])); - PADDLE_ENFORCE_GT(stride_width, - 0, - phi::errors::InvalidArgument( - "The `strides` should be greater than zero, " - "but received strides_height: %d strides_width: %d.", - strides[0], - strides[1])); - // check dilations - PADDLE_ENFORCE_GT(output_height, - 1, - phi::errors::InvalidArgument( - "The `output_height` should be greater than one, " - "but received output_height: %d .", - output_height)); - PADDLE_ENFORCE_GT(output_width, - 1, - phi::errors::InvalidArgument( - "The `output_width` should be greater than one, " - "but received output_width: %d .", - output_width)); - // check output size - PADDLE_ENFORCE_GT( - dilation_height, - 0, - phi::errors::InvalidArgument( - "The `dilations` should be greater than zero, " - "but received dilations_height: %d dilations_width: %d.", - dilations[0], - dilations[1])); - PADDLE_ENFORCE_GT( - dilation_width, - 0, - phi::errors::InvalidArgument( - "The `dilations` should be greater than zero, " - "but received dilations_height: %d dilations_width: %d.", - dilations[0], - dilations[1])); - - std::vector out_dims; - // batch_size - out_dims.push_back(in_dims[0]); - // output_plane - int output_channels = in_dims[1] / (kernel_width * kernel_height); - out_dims.push_back(output_channels); - - int blocks_height = (output_sizes[0] + 2 * paddings[0] - - (dilations[0] * (kernel_sizes[0] - 1) + 1)) / - strides[0] + - 1; - int blocks_width = (output_sizes[1] + 2 * paddings[1] - - (dilations[1] * (kernel_sizes[1] - 1) + 1)) / - strides[1] + - 1; - - // check output height and width - PADDLE_ENFORCE_GT( - blocks_height, - 0, - phi::errors::InvalidArgument( - "The sliding blocks calculated from input spatial size (%d, %d), " - "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " - "is (%d, %d), which should be a positive integer.", - in_dims[2], - in_dims[3], - kernel_sizes[0], - kernel_sizes[1], - strides[0], - strides[1], - dilations[0], - dilations[1], - output_height, - output_width)); - - PADDLE_ENFORCE_GT( - blocks_width, - 0, - phi::errors::InvalidArgument( - "The sliding blocks calculated from input spatial size (%d, %d), " - "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " - "is (%d, %d), which should be a positive integer.", - in_dims[2], - in_dims[3], - kernel_sizes[0], - kernel_sizes[1], - strides[0], - strides[1], - dilations[0], - dilations[1], - output_height, - output_width)); - - PADDLE_ENFORCE_EQ( - blocks_height * blocks_width, - in_dims[2], - phi::errors::InvalidArgument( - "Given input output_size (%d, %d), " - "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " - "which should be expected size of input's dimension " - "2 to 
match the calculated number of %d * %d = %d, but got %d", - output_height, - output_width, - kernel_sizes[0], - kernel_sizes[1], - strides[0], - strides[1], - dilations[0], - dilations[1], - blocks_height, - blocks_width, - blocks_height * blocks_width, - in_dims[2])); - - PADDLE_ENFORCE_EQ( - in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), - 0, - phi::errors::InvalidArgument( - "Expected size of input's dimension 1 to be divisible by the" - "product of kernel_size, but got input.size(1)=%d and " - "kernel_size=( %d" - ", %d).", - in_dims[1], - kernel_sizes[0], - kernel_sizes[1])); - - out_dims.push_back(output_height); - out_dims.push_back(output_width); - if (out != nullptr) { - out->set_dims(phi::make_ddim(out_dims)); - out->set_dtype(x.dtype()); - } -} - } // namespace phi PD_REGISTER_INFER_META_FN(flatten, phi::FlattenInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 66f72a681e2a0..153b2b8f5f217 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -65,6 +65,11 @@ void BatchSizeLikeInferMeta(const MetaTensor& x, void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out); +void ChannelShuffleInferMeta(const MetaTensor& x, + int groups, + const std::string& data_format, + MetaTensor* out); + void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); void ClassCenterSampleInferMeta(const MetaTensor& label, @@ -191,6 +196,14 @@ void FlipInferMeta(const MetaTensor& x, const std::vector& axis, MetaTensor* out); +void FoldInferMeta(const MetaTensor& x, + const std::vector& output_sizes, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + MetaTensor* out); + void FrameInferMeta(const MetaTensor& x, int frame_length, int hop_length, @@ -214,6 +227,8 @@ void GumbelSoftmaxInferMeta(const MetaTensor& x, void HistogramInferMeta( const MetaTensor& input, int64_t bins, int min, int max, MetaTensor* out); +void IdentityLossInferMeta(const MetaTensor& x, int reduction, MetaTensor* out); + void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out); void InferMetaFromVecValue(const MetaTensor& x, @@ -288,6 +303,8 @@ void NanmedianInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* median_index); +void NonZeroInferMeta(const MetaTensor& condition, MetaTensor* out); + void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out); void NormInferMeta(const MetaTensor& x, @@ -297,6 +314,14 @@ void NormInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* norm); +void OneHotRawInferMeta(const MetaTensor& x, + const Scalar& depth, + DataType dtype, + bool allow_out_of_range, + MetaTensor* out); + +void OneHotInferMeta(const MetaTensor& x, const Scalar& depth, MetaTensor* out); + void OverlapAddInferMeta(const MetaTensor& x, int hop_length, int axis, @@ -576,10 +601,10 @@ void TransposeGradInferMeta(const MetaTensor& x, const std::vector& axis, MetaTensor* out); -void TrilTriuInferMeta(const MetaTensor& x, - int diagonal, - bool lower, - MetaTensor* out); +void TrilInferMeta(const MetaTensor& x, + int diagonal, + bool lower, + MetaTensor* out); void UnbindInferMeta(const MetaTensor& x, int axis, @@ -657,29 +682,4 @@ void UnStackInferMeta(const MetaTensor& x, int num, std::vector outs); -void OneHotRawInferMeta(const MetaTensor& x, - const Scalar& depth, - DataType dtype, - bool allow_out_of_range, - MetaTensor* out); - -void OneHotInferMeta(const MetaTensor& x, const Scalar& depth, MetaTensor* 
diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h
index 66f72a681e2a0..153b2b8f5f217 100644
--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -65,6 +65,11 @@ void BatchSizeLikeInferMeta(const MetaTensor& x,
 
 void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out);
 
+void ChannelShuffleInferMeta(const MetaTensor& x,
+                             int groups,
+                             const std::string& data_format,
+                             MetaTensor* out);
+
 void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out);
 
 void ClassCenterSampleInferMeta(const MetaTensor& label,
@@ -191,6 +196,14 @@ void FlipInferMeta(const MetaTensor& x,
                    const std::vector<int>& axis,
                    MetaTensor* out);
 
+void FoldInferMeta(const MetaTensor& x,
+                   const std::vector<int>& output_sizes,
+                   const std::vector<int>& kernel_sizes,
+                   const std::vector<int>& strides,
+                   const std::vector<int>& paddings,
+                   const std::vector<int>& dilations,
+                   MetaTensor* out);
+
 void FrameInferMeta(const MetaTensor& x,
                     int frame_length,
                     int hop_length,
@@ -214,6 +227,8 @@ void GumbelSoftmaxInferMeta(const MetaTensor& x,
 void HistogramInferMeta(
     const MetaTensor& input, int64_t bins, int min, int max, MetaTensor* out);
 
+void IdentityLossInferMeta(const MetaTensor& x, int reduction, MetaTensor* out);
+
 void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out);
 
 void InferMetaFromVecValue(const MetaTensor& x,
@@ -288,6 +303,8 @@ void NanmedianInferMeta(const MetaTensor& x,
                         MetaTensor* out,
                         MetaTensor* median_index);
 
+void NonZeroInferMeta(const MetaTensor& condition, MetaTensor* out);
+
 void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out);
 
 void NormInferMeta(const MetaTensor& x,
@@ -297,6 +314,14 @@ void NormInferMeta(const MetaTensor& x,
                    MetaTensor* out,
                    MetaTensor* norm);
 
+void OneHotRawInferMeta(const MetaTensor& x,
+                        const Scalar& depth,
+                        DataType dtype,
+                        bool allow_out_of_range,
+                        MetaTensor* out);
+
+void OneHotInferMeta(const MetaTensor& x, const Scalar& depth, MetaTensor* out);
+
 void OverlapAddInferMeta(const MetaTensor& x,
                          int hop_length,
                          int axis,
@@ -576,10 +601,10 @@ void TransposeGradInferMeta(const MetaTensor& x,
                             const std::vector<int>& axis,
                             MetaTensor* out);
 
-void TrilTriuInferMeta(const MetaTensor& x,
-                       int diagonal,
-                       bool lower,
-                       MetaTensor* out);
+void TrilInferMeta(const MetaTensor& x,
+                   int diagonal,
+                   bool lower,
+                   MetaTensor* out);
 
 void UnbindInferMeta(const MetaTensor& x,
                      int axis,
@@ -657,29 +682,4 @@ void UnStackInferMeta(const MetaTensor& x,
                       int num,
                       std::vector<MetaTensor*> outs);
 
-void OneHotRawInferMeta(const MetaTensor& x,
-                        const Scalar& depth,
-                        DataType dtype,
-                        bool allow_out_of_range,
-                        MetaTensor* out);
-
-void OneHotInferMeta(const MetaTensor& x, const Scalar& depth, MetaTensor* out);
-
-void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out);
-
-void ChannelShuffleInferMeta(const MetaTensor& x,
-                             int groups,
-                             const std::string& data_format,
-                             MetaTensor* out);
-
-void IdentityLossInferMeta(const MetaTensor& x, int reduction, MetaTensor* out);
-
-void FoldInferMeta(const MetaTensor& x,
-                   const std::vector<int>& output_sizes,
-                   const std::vector<int>& kernel_sizes,
-                   const std::vector<int>& strides,
-                   const std::vector<int>& paddings,
-                   const std::vector<int>& dilations,
-                   MetaTensor* out);
-
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc
deleted file mode 100644
index eee4525293f3f..0000000000000
--- a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h"
-
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void HierarchicalSigmoidGradKernel(const Context& ctx,
-                                   const DenseTensor& x,
-                                   const DenseTensor& w,
-                                   const DenseTensor& label,
-                                   const paddle::optional<DenseTensor>& path,
-                                   const paddle::optional<DenseTensor>& code,
-                                   const paddle::optional<DenseTensor>& bias,
-                                   const DenseTensor& pre_out,
-                                   const DenseTensor& out_grad,
-                                   int num_classes,
-                                   bool remote_prefetch,
-                                   int trainer_id,
-                                   const std::vector<int64_t>& height_sections,
-                                   const std::vector<std::string>& epmap,
-                                   const std::vector<std::string>& table_names,
-                                   bool is_sparse,
-                                   DenseTensor* x_grad,
-                                   DenseTensor* w_grad,
-                                   DenseTensor* bias_grad) {
-  HierarchicalSigmoidGradKernelImpl<T>(ctx,
-                                       x,
-                                       w,
-                                       label,
-                                       path,
-                                       code,
-                                       bias,
-                                       pre_out,
-                                       out_grad,
-                                       num_classes,
-                                       remote_prefetch,
-                                       trainer_id,
-                                       height_sections,
-                                       epmap,
-                                       table_names,
-                                       is_sparse,
-                                       x_grad,
-                                       w_grad,
-                                       bias_grad);
-}
-
-}  // namespace phi
-
-PD_REGISTER_KERNEL(hierarchical_sigmoid_grad,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::HierarchicalSigmoidGradKernel,
-                   float,
-                   double) {}
diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h b/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h
similarity index 71%
rename from paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h
rename to paddle/phi/kernels/cpu/hsigmoid_loss_grad.h
index 9b38095f25f75..7e6693c4dd7a1 100644
--- a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h
+++ b/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h
@@ -26,27 +26,26 @@ namespace phi {
 
 namespace math = paddle::operators::math;
 
 template <typename T, typename Context>
-void HierarchicalSigmoidGradKernelImpl(
-    const Context& ctx,
-    const DenseTensor& x,
-    const DenseTensor& w,
-    const DenseTensor& label,
-    const paddle::optional<DenseTensor>& path,
-    const paddle::optional<DenseTensor>& code,
-    const paddle::optional<DenseTensor>& bias,
-    const DenseTensor& pre_out,
-    const DenseTensor& out_grad,
-    int num_classes,
-    bool remote_prefetch,
-    int trainer_id,
-    const std::vector<int64_t>& height_sections,
-    const std::vector<std::string>& epmap,
-    const std::vector<std::string>& table_names,
-    bool is_sparse,
-    DenseTensor* x_grad,
-    DenseTensor* w_grad,
-    DenseTensor* bias_grad,
-    SelectedRows* w_grad_sr = nullptr) {
+void HSigmoidLossGradKernelImpl(const Context& ctx,
+                                const DenseTensor& x,
+                                const DenseTensor& w,
+                                const DenseTensor& label,
+                                const paddle::optional<DenseTensor>& path,
+                                const paddle::optional<DenseTensor>& code,
+                                const paddle::optional<DenseTensor>& bias,
+                                const DenseTensor& pre_out,
+                                const DenseTensor& out_grad,
+                                int num_classes,
+                                bool remote_prefetch,
+                                int trainer_id,
+                                const std::vector<int64_t>& height_sections,
+                                const std::vector<std::string>& epmap,
+                                const std::vector<std::string>& table_names,
+                                bool is_sparse,
+                                DenseTensor* x_grad,
+                                DenseTensor* w_grad,
+                                DenseTensor* bias_grad,
+                                SelectedRows* w_grad_sr = nullptr) {
   funcs::SetConstant<Context, T> zero;
   DenseTensor pre_out_grad;
diff --git a/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc
new file mode 100644
index 0000000000000..efb59d1f48267
--- /dev/null
+++ b/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/hsigmoid_loss_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/hsigmoid_loss_grad.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void HSigmoidLossGradKernel(const Context& ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& w,
+                            const DenseTensor& label,
+                            const paddle::optional<DenseTensor>& path,
+                            const paddle::optional<DenseTensor>& code,
+                            const paddle::optional<DenseTensor>& bias,
+                            const DenseTensor& pre_out,
+                            const DenseTensor& out_grad,
+                            int num_classes,
+                            bool remote_prefetch,
+                            int trainer_id,
+                            const std::vector<int64_t>& height_sections,
+                            const std::vector<std::string>& epmap,
+                            const std::vector<std::string>& table_names,
+                            bool is_sparse,
+                            DenseTensor* x_grad,
+                            DenseTensor* w_grad,
+                            DenseTensor* bias_grad) {
+  HSigmoidLossGradKernelImpl<T>(ctx,
+                                x,
+                                w,
+                                label,
+                                path,
+                                code,
+                                bias,
+                                pre_out,
+                                out_grad,
+                                num_classes,
+                                remote_prefetch,
+                                trainer_id,
+                                height_sections,
+                                epmap,
+                                table_names,
+                                is_sparse,
+                                x_grad,
+                                w_grad,
+                                bias_grad);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(hsigmoid_loss_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::HSigmoidLossGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc b/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc
similarity index 72%
rename from paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc
rename to paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc
index 7c3421e88d449..fa0c83031d904 100644
--- a/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc
+++ b/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/hierarchical_sigmoid_kernel.h"
+#include "paddle/phi/kernels/hsigmoid_loss_kernel.h"
 
 #include "paddle/fluid/operators/math/matrix_bit_code.h"
 #include "paddle/fluid/platform/transform.h"
@@ -28,23 +28,23 @@ namespace phi {
 
 namespace math = paddle::operators::math;
 
 template <typename T, typename Context>
-void HierarchicalSigmoidKernel(const Context& ctx,
-                               const DenseTensor& x,
-                               const DenseTensor& w,
-                               const DenseTensor& label,
-                               const paddle::optional<DenseTensor>& path,
-                               const paddle::optional<DenseTensor>& code,
-                               const paddle::optional<DenseTensor>& bias,
-                               int num_classes,
-                               bool remote_prefetch,
-                               int trainer_id,
-                               const std::vector<int64_t>& height_sections,
-                               const std::vector<std::string>& epmap,
-                               const std::vector<std::string>& table_names,
-                               bool is_sparse,
-                               DenseTensor* out,
-                               DenseTensor* pre_out,
-                               DenseTensor* w_out) {
+void HSigmoidLossKernel(const Context& ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& w,
+                        const DenseTensor& label,
+                        const paddle::optional<DenseTensor>& path,
+                        const paddle::optional<DenseTensor>& code,
+                        const paddle::optional<DenseTensor>& bias,
+                        int num_classes,
+                        bool remote_prefetch,
+                        int trainer_id,
+                        const std::vector<int64_t>& height_sections,
+                        const std::vector<std::string>& epmap,
+                        const std::vector<std::string>& table_names,
+                        bool is_sparse,
+                        DenseTensor* out,
+                        DenseTensor* pre_out,
+                        DenseTensor* w_out) {
   size_t num_classes_st = static_cast<size_t>(num_classes);
   // for remote prefetch
 
@@ -106,9 +106,5 @@ void HierarchicalSigmoidKernel(const Context& ctx,
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(hierarchical_sigmoid,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::HierarchicalSigmoidKernel,
-                   float,
-                   double) {}
+PD_REGISTER_KERNEL(
+    hsigmoid_loss, CPU, ALL_LAYOUT, phi::HSigmoidLossKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/where_index_kernel.cc b/paddle/phi/kernels/cpu/nonzero_kernel.cc
similarity index 90%
rename from paddle/phi/kernels/cpu/where_index_kernel.cc
rename to paddle/phi/kernels/cpu/nonzero_kernel.cc
index da6eff74011ea..fca8e6b09fc28 100644
--- a/paddle/phi/kernels/cpu/where_index_kernel.cc
+++ b/paddle/phi/kernels/cpu/nonzero_kernel.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/where_index_kernel.h"
+#include "paddle/phi/kernels/nonzero_kernel.h"
 
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -47,9 +47,9 @@ struct WhereIndexFunctor {
 };
 
 template <typename T, typename Context>
-void WhereIndexKernel(const Context& dev_ctx,
-                      const DenseTensor& condition,
-                      DenseTensor* out) {
+void NonZeroKernel(const Context& dev_ctx,
+                   const DenseTensor& condition,
+                   DenseTensor* out) {
   const T* cond_data = condition.data<T>();
   auto numel = condition.numel();
   auto dims = condition.dims();
@@ -83,10 +83,10 @@ void WhereIndexKernel(const Context& dev_ctx,
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(where_index,
+PD_REGISTER_KERNEL(nonzero,
                    CPU,
                    ALL_LAYOUT,
-                   phi::WhereIndexKernel,
+                   phi::NonZeroKernel,
                    int64_t,
                    int,
                    int16_t,
-#include "paddle/phi/kernels/reduce_prod_grad_kernel.h" +#include "paddle/phi/kernels/prod_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/prod_grad_kernel_impl.h" PD_REGISTER_KERNEL(prod_grad, CPU, ALL_LAYOUT, - phi::ReduceProdGradKernel, + phi::ProdGradKernel, float, double, int, diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/prod_kernel.cc similarity index 96% rename from paddle/phi/kernels/cpu/reduce_prod_kernel.cc rename to paddle/phi/kernels/cpu/prod_kernel.cc index 36766d27ed434..af5ea5cb9568d 100644 --- a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc +++ b/paddle/phi/kernels/cpu/prod_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/kernels/prod_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/tril_triu_kernel.cc b/paddle/phi/kernels/cpu/tril_grad_kernel.cc similarity index 88% rename from paddle/phi/kernels/cpu/tril_triu_kernel.cc rename to paddle/phi/kernels/cpu/tril_grad_kernel.cc index f3599bb92b97b..fba457424fc05 100644 --- a/paddle/phi/kernels/cpu/tril_triu_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_grad_kernel.cc @@ -14,12 +14,12 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" +#include "paddle/phi/kernels/impl/tril_grad_kernel_impl.h" -PD_REGISTER_KERNEL(tril_triu, +PD_REGISTER_KERNEL(tril_grad, CPU, ALL_LAYOUT, - phi::TrilTriuKernel, + phi::TrilGradKernel, bool, float, double, diff --git a/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc b/paddle/phi/kernels/cpu/tril_kernel.cc similarity index 86% rename from paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc rename to paddle/phi/kernels/cpu/tril_kernel.cc index 660254fef86f6..82902a1977297 100644 --- a/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_kernel.cc @@ -14,12 +14,12 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/tril_kernel_impl.h" -PD_REGISTER_KERNEL(tril_triu_grad, +PD_REGISTER_KERNEL(tril, CPU, ALL_LAYOUT, - phi::TrilTriuGradKernel, + phi::TrilKernel, bool, float, double, diff --git a/paddle/phi/kernels/cpu/uniform_random_inplace_grad_kernel.cc b/paddle/phi/kernels/cpu/uniform_inplace_grad_kernel.cc similarity index 59% rename from paddle/phi/kernels/cpu/uniform_random_inplace_grad_kernel.cc rename to paddle/phi/kernels/cpu/uniform_inplace_grad_kernel.cc index d448312949eb5..6358ff9962aa3 100644 --- a/paddle/phi/kernels/cpu/uniform_random_inplace_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/uniform_inplace_grad_kernel.cc @@ -12,22 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/kernels/uniform_random_inplace_grad_kernel.h" +#include "paddle/phi/kernels/uniform_inplace_grad_kernel.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { template -void UniformRandomInplaceGradKernel(const Context& ctx, - const DenseTensor& out_grad, - float min, - float max, - int seed, - int diag_num, - int diag_step, - float diag_val, - DenseTensor* x_grad) { +void UniformInplaceGradKernel(const Context& ctx, + const DenseTensor& out_grad, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* x_grad) { if (x_grad) { auto* data = ctx.template Alloc(x_grad); std::fill(data, data + x_grad->numel(), T(0)); @@ -36,9 +36,9 @@ void UniformRandomInplaceGradKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(uniform_random_inplace_grad, +PD_REGISTER_KERNEL(uniform_inplace_grad, CPU, ALL_LAYOUT, - phi::UniformRandomInplaceGradKernel, + phi::UniformInplaceGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/uniform_random_inplace_kernel.cc b/paddle/phi/kernels/cpu/uniform_inplace_kernel.cc similarity index 68% rename from paddle/phi/kernels/cpu/uniform_random_inplace_kernel.cc rename to paddle/phi/kernels/cpu/uniform_inplace_kernel.cc index 6e687fbf54341..b6801e4caf51b 100644 --- a/paddle/phi/kernels/cpu/uniform_random_inplace_kernel.cc +++ b/paddle/phi/kernels/cpu/uniform_inplace_kernel.cc @@ -12,22 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/uniform_random_inplace_kernel.h" +#include "paddle/phi/kernels/uniform_inplace_kernel.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { template -void UniformRandomInplaceKernel(const Context& ctx, - const DenseTensor& x, - float min, - float max, - int seed, - int diag_num, - int diag_step, - float diag_val, - DenseTensor* out) { +void UniformInplaceKernel(const Context& ctx, + const DenseTensor& x, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out) { T* data = ctx.template Alloc(out); int64_t size = out->numel(); std::uniform_real_distribution dist(static_cast(min), @@ -46,9 +46,9 @@ void UniformRandomInplaceKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(uniform_random_inplace, +PD_REGISTER_KERNEL(uniform_inplace, CPU, ALL_LAYOUT, - phi::UniformRandomInplaceKernel, + phi::UniformInplaceKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_kernel.cc similarity index 76% rename from paddle/phi/kernels/cpu/uniform_random_kernel.cc rename to paddle/phi/kernels/cpu/uniform_kernel.cc index a4e66a8f64536..1b1503473d968 100644 --- a/paddle/phi/kernels/cpu/uniform_random_kernel.cc +++ b/paddle/phi/kernels/cpu/uniform_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/kernels/uniform_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/uniform_real_distribution.h" @@ -20,16 +20,16 @@ namespace phi { template -void UniformRandomRawKernel(const Context &dev_ctx, - const IntArray &shape, - DataType dtype, - const Scalar &min, - const Scalar &max, - int seed, - int diag_num, - int diag_step, - float diag_val, - DenseTensor *out) { +void UniformRawKernel(const Context &dev_ctx, + const IntArray &shape, + DataType dtype, + const Scalar &min, + const Scalar &max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor *out) { out->Resize(phi::make_ddim(shape.GetData())); T *data = dev_ctx.template Alloc(out); auto size = out->numel(); @@ -63,10 +63,10 @@ void UniformRandomRawKernel(const Context &dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(uniform_random_raw, +PD_REGISTER_KERNEL(uniform_raw, CPU, ALL_LAYOUT, - phi::UniformRandomRawKernel, + phi::UniformRawKernel, float, double, phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/yolov3_loss_functor.h b/paddle/phi/kernels/cpu/yolo_loss_functor.h similarity index 100% rename from paddle/phi/kernels/cpu/yolov3_loss_functor.h rename to paddle/phi/kernels/cpu/yolo_loss_functor.h diff --git a/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/yolo_loss_grad_kernel.cc similarity index 85% rename from paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc rename to paddle/phi/kernels/cpu/yolo_loss_grad_kernel.cc index bc3beae8b0550..647a093472a0d 100644 --- a/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/yolo_loss_grad_kernel.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/yolov3_loss_grad_kernel.h" +#include "paddle/phi/kernels/yolo_loss_grad_kernel.h" #include #include #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/yolov3_loss_functor.h" +#include "paddle/phi/kernels/cpu/yolo_loss_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -117,25 +117,25 @@ static inline void CalcObjnessLossGrad(T* input_grad, } template -void Yolov3LossGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& gt_box, - const DenseTensor& gt_label, - const paddle::optional& gt_score, - const DenseTensor& objectness_mask, - const DenseTensor& gt_match_mask, - const DenseTensor& loss_grad, - const std::vector& anchors, - const std::vector& anchor_mask, - int class_num, - float ignore_thresh, - int downsample_ratio, - bool use_label_smooth, - float scale_x_y, - DenseTensor* x_grad, - DenseTensor* gt_box_grad, - DenseTensor* gt_label_grad, - DenseTensor* gt_score_grad) { +void YoloLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& gt_box, + const DenseTensor& gt_label, + const paddle::optional& gt_score, + const DenseTensor& objectness_mask, + const DenseTensor& gt_match_mask, + const DenseTensor& loss_grad, + const std::vector& anchors, + const std::vector& anchor_mask, + int class_num, + float ignore_thresh, + int downsample_ratio, + bool use_label_smooth, + float scale_x_y, + DenseTensor* x_grad, + DenseTensor* gt_box_grad, + DenseTensor* gt_label_grad, + DenseTensor* gt_score_grad) { auto* input = &x; auto input_grad = x_grad; auto* objness_mask = &objectness_mask; @@ -237,9 +237,5 @@ void Yolov3LossGradKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(yolov3_loss_grad, - CPU, - ALL_LAYOUT, - phi::Yolov3LossGradKernel, - float, - double) {} +PD_REGISTER_KERNEL( + yolo_loss_grad, CPU, ALL_LAYOUT, phi::YoloLossGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc b/paddle/phi/kernels/cpu/yolo_loss_kernel.cc similarity index 93% rename from paddle/phi/kernels/cpu/yolov3_loss_kernel.cc rename to paddle/phi/kernels/cpu/yolo_loss_kernel.cc index 75b2e3c5c4a0e..b32d7ee5962c3 100644 --- a/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/yolo_loss_kernel.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/yolov3_loss_kernel.h" +#include "paddle/phi/kernels/yolo_loss_kernel.h" #include #include #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/yolov3_loss_functor.h" +#include "paddle/phi/kernels/cpu/yolo_loss_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -178,21 +178,21 @@ static void inline GtValid(bool* valid, } template -void Yolov3LossKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& gt_box, - const DenseTensor& gt_label, - const paddle::optional& gt_score, - const std::vector& anchors, - const std::vector& anchor_mask, - int class_num, - float ignore_thresh, - int downsample_ratio, - bool use_label_smooth, - float scale_x_y, - DenseTensor* loss, - DenseTensor* objectness_mask, - DenseTensor* gt_match_mask) { +void YoloLossKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& gt_box, + const DenseTensor& gt_label, + const paddle::optional& gt_score, + const std::vector& anchors, + const std::vector& anchor_mask, + int class_num, + float ignore_thresh, + int downsample_ratio, + bool use_label_smooth, + float scale_x_y, + DenseTensor* loss, + DenseTensor* objectness_mask, + DenseTensor* gt_match_mask) { auto* input = &x; auto objness_mask = objectness_mask; float scale = scale_x_y; @@ -371,4 +371,4 @@ void Yolov3LossKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - yolov3_loss, CPU, ALL_LAYOUT, phi::Yolov3LossKernel, float, double) {} + yolo_loss, CPU, ALL_LAYOUT, phi::YoloLossKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index adb0ca09d8938..0e59dbe9df7fa 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu +++ b/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -23,7 +23,7 @@ #include "paddle/phi/kernels/funcs/slice.h" #include "paddle/phi/kernels/impl/lstsq_kernel_impl.h" #include "paddle/phi/kernels/impl/qr_kernel_impl.h" -#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" +#include "paddle/phi/kernels/impl/tril_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" @@ -110,7 +110,7 @@ void LstsqKernel(const Context& dev_ctx, DenseTensor* res_r = new DenseTensor(); res_r->Resize(phi::make_ddim({batch_count, min_mn, min_mn})); dev_ctx.template Alloc(res_r); - phi::TrilTriuKernel(dev_ctx, slice_r, 0, false, res_r); + phi::TrilKernel(dev_ctx, slice_r, 0, false, res_r); DenseTensor trans_y = phi::TransposeLast2Dim(dev_ctx, tmp_y); DenseTensor slice_y = @@ -135,7 +135,7 @@ void LstsqKernel(const Context& dev_ctx, DenseTensor* res_r = new DenseTensor(); res_r->Resize(phi::make_ddim({batch_count, min_mn, min_mn})); dev_ctx.template Alloc(res_r); - phi::TrilTriuKernel(dev_ctx, slice_r, 0, false, res_r); + phi::TrilKernel(dev_ctx, slice_r, 0, false, res_r); phi::TriangularSolveKernel( dev_ctx, *res_r, *new_y, true, true, false, solution); diff --git a/paddle/phi/kernels/gpu/where_index_kernel.cu b/paddle/phi/kernels/gpu/nonzero_kernel.cu similarity index 90% rename from paddle/phi/kernels/gpu/where_index_kernel.cu rename to paddle/phi/kernels/gpu/nonzero_kernel.cu index c16859c52b22a..11139c7d65ded 100644 --- a/paddle/phi/kernels/gpu/where_index_kernel.cu +++ b/paddle/phi/kernels/gpu/nonzero_kernel.cu @@ -25,7 +25,7 @@ namespace cub = hipcub; #include "paddle/phi/core/kernel_registry.h" #include 
"paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/select_impl.cu.h" -#include "paddle/phi/kernels/where_index_kernel.h" +#include "paddle/phi/kernels/nonzero_kernel.h" namespace phi { template @@ -62,9 +62,9 @@ struct IndexFunctor { }; template -void WhereIndexKernel(const Context &dev_ctx, - const DenseTensor &condition, - DenseTensor *out) { +void NonZeroKernel(const Context &dev_ctx, + const DenseTensor &condition, + DenseTensor *out) { DenseTensor in_data; auto dims = condition.dims(); using Functor = IndexFunctor; @@ -74,10 +74,10 @@ void WhereIndexKernel(const Context &dev_ctx, } } // namespace phi -PD_REGISTER_KERNEL(where_index, +PD_REGISTER_KERNEL(nonzero, GPU, ALL_LAYOUT, - phi::WhereIndexKernel, + phi::NonZeroKernel, int64_t, int, int16_t, diff --git a/paddle/phi/kernels/gpu/reduce_prod_grad_kernel.cu b/paddle/phi/kernels/gpu/prod_grad_kernel.cu similarity index 84% rename from paddle/phi/kernels/gpu/reduce_prod_grad_kernel.cu rename to paddle/phi/kernels/gpu/prod_grad_kernel.cu index 08444cf95d6c6..301cc46b0b730 100644 --- a/paddle/phi/kernels/gpu/reduce_prod_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/prod_grad_kernel.cu @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_prod_grad_kernel.h" +#include "paddle/phi/kernels/prod_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/prod_grad_kernel_impl.h" PD_REGISTER_KERNEL(prod_grad, GPU, ALL_LAYOUT, - phi::ReduceProdGradKernel, + phi::ProdGradKernel, float, double, int, diff --git a/paddle/phi/kernels/gpu/qr_kernel.cu b/paddle/phi/kernels/gpu/qr_kernel.cu index 99752ac486d6e..697cf952c1cec 100644 --- a/paddle/phi/kernels/gpu/qr_kernel.cu +++ b/paddle/phi/kernels/gpu/qr_kernel.cu @@ -31,7 +31,7 @@ #include "paddle/phi/kernels/qr_kernel.h" #include "paddle/phi/kernels/slice_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" -#include "paddle/phi/kernels/tril_triu_kernel.h" +#include "paddle/phi/kernels/tril_kernel.h" namespace phi { @@ -103,12 +103,12 @@ void QrKernel(const Context& ctx, auto trans_qr = TransposeLast2Dim(ctx, qr); auto sliced_qr = SliceKernel( ctx, trans_qr, {trans_qr.dims().size() - 2}, {0}, {min_mn}, {1}, {}); - auto tmp_r = TrilTriu(ctx, sliced_qr, 0, false); + auto tmp_r = Tril(ctx, sliced_qr, 0, false); // Transpose 'tmp_r' to retore the original row-major order phi::Copy(ctx, tmp_r, r->place(), false, r); } else { auto trans_qr = TransposeLast2Dim(ctx, qr); - auto tmp_r = TrilTriu(ctx, trans_qr, 0, false); + auto tmp_r = Tril(ctx, trans_qr, 0, false); // Transpose 'tmp_r' to retore the original row-major order phi::Copy(ctx, tmp_r, r->place(), false, r); } diff --git a/paddle/phi/kernels/gpu/tril_triu_kernel.cu b/paddle/phi/kernels/gpu/tril_grad_kernel.cu similarity index 88% rename from paddle/phi/kernels/gpu/tril_triu_kernel.cu rename to paddle/phi/kernels/gpu/tril_grad_kernel.cu index 65dcca70584b8..5bda0e54b33a6 100644 --- a/paddle/phi/kernels/gpu/tril_triu_kernel.cu +++ b/paddle/phi/kernels/gpu/tril_grad_kernel.cu @@ -14,12 +14,12 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" +#include "paddle/phi/kernels/impl/tril_grad_kernel_impl.h" -PD_REGISTER_KERNEL(tril_triu, +PD_REGISTER_KERNEL(tril_grad, 
GPU, ALL_LAYOUT, - phi::TrilTriuKernel, + phi::TrilGradKernel, bool, float, double, diff --git a/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu b/paddle/phi/kernels/gpu/tril_kernel.cu similarity index 86% rename from paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu rename to paddle/phi/kernels/gpu/tril_kernel.cu index 3271b38ae8726..c50b7c513fd07 100644 --- a/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/tril_kernel.cu @@ -14,12 +14,12 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/tril_kernel_impl.h" -PD_REGISTER_KERNEL(tril_triu_grad, +PD_REGISTER_KERNEL(tril, GPU, ALL_LAYOUT, - phi::TrilTriuGradKernel, + phi::TrilKernel, bool, float, double, diff --git a/paddle/phi/kernels/gpu/uniform_random_inplace_grad_kernel.cu b/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu similarity index 61% rename from paddle/phi/kernels/gpu/uniform_random_inplace_grad_kernel.cu rename to paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu index 6c6f525a8d96b..61efe8807643d 100644 --- a/paddle/phi/kernels/gpu/uniform_random_inplace_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/uniform_random_inplace_grad_kernel.h" +#include "paddle/phi/kernels/uniform_inplace_grad_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" @@ -20,15 +20,15 @@ limitations under the License. */ namespace phi { template -void UniformRandomInplaceGradKernel(const Context& ctx, - const DenseTensor& out_grad, - float min, - float max, - int seed, - int diag_num, - int diag_step, - float diag_val, - DenseTensor* x_grad) { +void UniformInplaceGradKernel(const Context& ctx, + const DenseTensor& out_grad, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* x_grad) { auto dims = vectorize(x_grad->dims()); float value = static_cast(0.0f); phi::FullKernel(ctx, dims, value, phi::DataType::UNDEFINED, x_grad); @@ -36,9 +36,9 @@ void UniformRandomInplaceGradKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(uniform_random_inplace_grad, +PD_REGISTER_KERNEL(uniform_inplace_grad, GPU, ALL_LAYOUT, - phi::UniformRandomInplaceGradKernel, + phi::UniformInplaceGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/uniform_random_inplace_kernel.cu b/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu similarity index 79% rename from paddle/phi/kernels/gpu/uniform_random_inplace_kernel.cu rename to paddle/phi/kernels/gpu/uniform_inplace_kernel.cu index d96f582b19185..29bc2f4de5b86 100644 --- a/paddle/phi/kernels/gpu/uniform_random_inplace_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/kernels/uniform_random_inplace_kernel.h" +#include "paddle/phi/kernels/uniform_inplace_kernel.h" #include @@ -54,15 +54,15 @@ struct UniformGenerator { }; template -void UniformRandomInplaceKernel(const Context& ctx, - const DenseTensor& x, - float min, - float max, - int seed, - int diag_num, - int diag_step, - float diag_val, - DenseTensor* out) { +void UniformInplaceKernel(const Context& ctx, + const DenseTensor& x, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out) { ctx.template Alloc(out); if (seed == 0) { // Use global Generator seed @@ -80,9 +80,9 @@ void UniformRandomInplaceKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(uniform_random_inplace, +PD_REGISTER_KERNEL(uniform_inplace, GPU, ALL_LAYOUT, - phi::UniformRandomInplaceKernel, + phi::UniformInplaceKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_kernel.cu similarity index 81% rename from paddle/phi/kernels/gpu/uniform_random_kernel.cu rename to paddle/phi/kernels/gpu/uniform_kernel.cu index 458239814b65e..277dadabea6d2 100644 --- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/kernels/uniform_kernel.h" #include @@ -54,16 +54,16 @@ struct UniformGenerator { }; template -void UniformRandomRawKernel(const Context& dev_ctx, - const IntArray& shape, - DataType dtype, - const Scalar& min, - const Scalar& max, - int seed, - int diag_num, - int diag_step, - float diag_val, - DenseTensor* out) { +void UniformRawKernel(const Context& dev_ctx, + const IntArray& shape, + DataType dtype, + const Scalar& min, + const Scalar& max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); dev_ctx.template Alloc(out); if (seed == 0) { @@ -86,10 +86,10 @@ void UniformRandomRawKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(uniform_random_raw, +PD_REGISTER_KERNEL(uniform_raw, GPU, ALL_LAYOUT, - phi::UniformRandomRawKernel, + phi::UniformRawKernel, float, double, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h deleted file mode 100644 index c0da8faadd592..0000000000000 --- a/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void HierarchicalSigmoidGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& w, - const DenseTensor& label, - const paddle::optional& path, - const paddle::optional& code, - const paddle::optional& bias, - const DenseTensor& pre_out, - const DenseTensor& out_grad, - int num_classes, - bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, - bool is_sparse, - DenseTensor* x_grad, - DenseTensor* w_grad, - DenseTensor* bias_grad); - -} // namespace phi diff --git a/paddle/phi/kernels/hierarchical_sigmoid_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h deleted file mode 100644 index e32306b645a6f..0000000000000 --- a/paddle/phi/kernels/hierarchical_sigmoid_kernel.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void HierarchicalSigmoidKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& w, - const DenseTensor& label, - const paddle::optional& path, - const paddle::optional& code, - const paddle::optional& bias, - int num_classes, - bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, - bool is_sparse, - DenseTensor* out, - DenseTensor* pre_out, - DenseTensor* w_out); - -} // namespace phi diff --git a/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h b/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h new file mode 100644 index 0000000000000..e31d429107990 --- /dev/null +++ b/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void HSigmoidLossGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + DenseTensor* w_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/hsigmoid_loss_kernel.h b/paddle/phi/kernels/hsigmoid_loss_kernel.h new file mode 100644 index 0000000000000..c8fb3ca77f3f9 --- /dev/null +++ b/paddle/phi/kernels/hsigmoid_loss_kernel.h @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void HSigmoidLossKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* out, + DenseTensor* pre_out, + DenseTensor* w_out); + +} // namespace phi diff --git a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h b/paddle/phi/kernels/impl/prod_grad_kernel_impl.h similarity index 69% rename from paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h rename to paddle/phi/kernels/impl/prod_grad_kernel_impl.h index a6f92543cc9c6..13f517c072c15 100644 --- a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/prod_grad_kernel_impl.h @@ -17,19 +17,19 @@ #include "paddle/phi/common/int_array.h" #include "paddle/phi/kernels/funcs/reduce_functor.h" #include "paddle/phi/kernels/impl/reduce_grad.h" -#include "paddle/phi/kernels/reduce_prod_grad_kernel.h" +#include "paddle/phi/kernels/prod_grad_kernel.h" namespace phi { template -void ReduceProdGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out, - const DenseTensor& out_grad, - const IntArray& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* x_grad) { +void ProdGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const IntArray& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { ReduceGradKernel( dev_ctx, x, out, out_grad, dims.GetData(), keep_dim, reduce_all, x_grad); } diff --git a/paddle/phi/kernels/impl/qr_grad_kernel_impl.h b/paddle/phi/kernels/impl/qr_grad_kernel_impl.h index 5c04d9bb90cfe..5ad59f757aa22 100644 --- a/paddle/phi/kernels/impl/qr_grad_kernel_impl.h +++ 
b/paddle/phi/kernels/impl/qr_grad_kernel_impl.h @@ -29,7 +29,7 @@ #include "paddle/phi/kernels/slice_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" -#include "paddle/phi/kernels/tril_triu_kernel.h" +#include "paddle/phi/kernels/tril_kernel.h" namespace phi { @@ -116,8 +116,8 @@ void QrGradKernel(const Context& ctx, DenseTensor M_tmp1 = Subtract(ctx, R_term, Q_term); // Compute M = (tril(M) + tril(M).mH()) * 0.5 Identity - DenseTensor M_tril_0 = TrilTriu(ctx, M_tmp1, 0, true); - DenseTensor M_tril_1 = TrilTriu(ctx, M_tmp1, -1, true); + DenseTensor M_tril_0 = Tril(ctx, M_tmp1, 0, true); + DenseTensor M_tril_1 = Tril(ctx, M_tmp1, -1, true); DenseTensor M = Add( ctx, M_tril_0, TransposeLast2Dim(ctx, M_tril_1)); diff --git a/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h b/paddle/phi/kernels/impl/tril_grad_kernel_impl.h similarity index 82% rename from paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h rename to paddle/phi/kernels/impl/tril_grad_kernel_impl.h index 91dbde04aca1f..3f72d34a957bd 100644 --- a/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/tril_grad_kernel_impl.h @@ -16,16 +16,16 @@ #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/tril_triu_compute.h" -#include "paddle/phi/kernels/tril_triu_grad_kernel.h" +#include "paddle/phi/kernels/tril_grad_kernel.h" namespace phi { template -void TrilTriuGradKernel(const Context& ctx, - const DenseTensor& out_grad, - int diagonal, - bool lower, - DenseTensor* x_grad) { +void TrilGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad) { const auto* dout_data = out_grad.data(); auto* dx_data = ctx.template Alloc(x_grad); diff --git a/paddle/phi/kernels/impl/tril_triu_kernel_impl.h b/paddle/phi/kernels/impl/tril_kernel_impl.h similarity index 83% rename from paddle/phi/kernels/impl/tril_triu_kernel_impl.h rename to paddle/phi/kernels/impl/tril_kernel_impl.h index 24c032893c3fb..8e93e87fbc4d8 100644 --- a/paddle/phi/kernels/impl/tril_triu_kernel_impl.h +++ b/paddle/phi/kernels/impl/tril_kernel_impl.h @@ -16,16 +16,16 @@ #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/tril_triu_compute.h" -#include "paddle/phi/kernels/tril_triu_kernel.h" +#include "paddle/phi/kernels/tril_kernel.h" namespace phi { template -void TrilTriuKernel(const Context& ctx, - const DenseTensor& x, - int diagonal, - bool lower, - DenseTensor* out) { +void TrilKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out) { const auto* x_data = x.data(); auto* out_data = ctx.template Alloc(out); diff --git a/paddle/phi/kernels/kps/reduce_prod_kernel.cu b/paddle/phi/kernels/kps/prod_kernel.cu similarity index 96% rename from paddle/phi/kernels/kps/reduce_prod_kernel.cu rename to paddle/phi/kernels/kps/prod_kernel.cu index f5b52937e36fe..326a351f6dabb 100644 --- a/paddle/phi/kernels/kps/reduce_prod_kernel.cu +++ b/paddle/phi/kernels/kps/prod_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
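The pair of Tril calls in the QrGradKernel hunk above implements the usual "copyltu" step of QR backward: keep tril(M, 0), then mirror the strictly lower triangle tril(M, -1) onto the upper side via the conjugate transpose. A toy real-valued 2x2 illustration (assumes a real matrix, so the conjugate is a plain copy; names are illustrative):

#include <array>

// copyltu(M) = tril(M, 0) + tril(M, -1)^H, here specialized to 2x2 reals.
std::array<std::array<double, 2>, 2> CopyLtu(
    std::array<std::array<double, 2>, 2> m) {
  m[0][1] = m[1][0];  // strictly lower entry mirrored above the diagonal
  return m;
}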
-#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/kernels/prod_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/gpu/reduce.h" diff --git a/paddle/phi/kernels/where_index_kernel.h b/paddle/phi/kernels/nonzero_kernel.h similarity index 84% rename from paddle/phi/kernels/where_index_kernel.h rename to paddle/phi/kernels/nonzero_kernel.h index 68b094637c8d5..757ddd7ac3d8d 100644 --- a/paddle/phi/kernels/where_index_kernel.h +++ b/paddle/phi/kernels/nonzero_kernel.h @@ -19,8 +19,8 @@ namespace phi { template -void WhereIndexKernel(const Context& dev_ctx, - const DenseTensor& condition, - DenseTensor* out); +void NonZeroKernel(const Context& dev_ctx, + const DenseTensor& condition, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/reduce_prod_grad_kernel.h b/paddle/phi/kernels/prod_grad_kernel.h similarity index 68% rename from paddle/phi/kernels/reduce_prod_grad_kernel.h rename to paddle/phi/kernels/prod_grad_kernel.h index fb773f167f90b..5b6f8807e94b3 100644 --- a/paddle/phi/kernels/reduce_prod_grad_kernel.h +++ b/paddle/phi/kernels/prod_grad_kernel.h @@ -20,12 +20,12 @@ namespace phi { template -void ReduceProdGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out, - const DenseTensor& out_grad, - const IntArray& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* x_grad); +void ProdGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const IntArray& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad); } // namespace phi diff --git a/paddle/phi/kernels/reduce_prod_kernel.cc b/paddle/phi/kernels/prod_kernel.cc similarity index 96% rename from paddle/phi/kernels/reduce_prod_kernel.cc rename to paddle/phi/kernels/prod_kernel.cc index 538c5a5175aa7..532b6fdaa141f 100644 --- a/paddle/phi/kernels/reduce_prod_kernel.cc +++ b/paddle/phi/kernels/prod_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/kernels/prod_kernel.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/reduce_prod_kernel.h b/paddle/phi/kernels/prod_kernel.h similarity index 100% rename from paddle/phi/kernels/reduce_prod_kernel.h rename to paddle/phi/kernels/prod_kernel.h diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc deleted file mode 100644 index 616786d210df7..0000000000000 --- a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h" - -#include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h" - -namespace phi { -namespace sr { - -static std::vector PathToRows(const DenseTensor& path) { - std::set rows; - const int64_t* paths = path.data(); - for (int64_t i = 0; i < path.numel(); ++i) { - int64_t row = paths[i]; - if (row < 0) { - continue; - } - rows.emplace(row); - } - return std::vector(rows.begin(), rows.end()); -} - -template -void HierarchicalSigmoidGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& w, - const DenseTensor& label, - const paddle::optional& path, - const paddle::optional& code, - const paddle::optional& bias, - const DenseTensor& pre_out, - const DenseTensor& out_grad, - int num_classes, - bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, - bool is_sparse, - DenseTensor* x_grad, - SelectedRows* w_grad, - DenseTensor* bias_grad) { - PADDLE_ENFORCE_NOT_NULL( - path.get_ptr(), - errors::NotFound("Custom tree must be set for sparse mode!")); - paddle::framework::Vector real_rows = PathToRows(*path); - w_grad->set_rows(real_rows); - // Build a map of id -> row_index to speed up finding the index of one id - w_grad->set_height(w.dims()[0]); - auto* w_grad_value = w_grad->mutable_value(); - phi::DDim temp_dim(w.dims()); - temp_dim[0] = real_rows.size(); - w_grad_value->Resize(temp_dim); - phi::HierarchicalSigmoidGradKernelImpl(ctx, - x, - w, - label, - path, - code, - bias, - pre_out, - out_grad, - num_classes, - remote_prefetch, - trainer_id, - height_sections, - epmap, - table_names, - is_sparse, - x_grad, - w_grad_value, - bias_grad, - w_grad); -} - -} // namespace sr -} // namespace phi - -PD_REGISTER_KERNEL(hierarchical_sigmoid_grad_sr, - CPU, - ALL_LAYOUT, - phi::sr::HierarchicalSigmoidGradKernel, - float, - double) {} diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h deleted file mode 100644 index aca355f515c44..0000000000000 --- a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/selected_rows.h" - -namespace phi { -namespace sr { - -template -void HierarchicalSigmoidGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& w, - const DenseTensor& label, - const paddle::optional& path, - const paddle::optional& code, - const paddle::optional& bias, - const DenseTensor& pre_out, - const DenseTensor& out_grad, - int num_classes, - bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, - bool is_sparse, - DenseTensor* x_grad, - SelectedRows* w_grad, - DenseTensor* bias_grad); - -} // namespace sr -} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc new file mode 100644 index 0000000000000..1fedcb14f5426 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h" + +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/hsigmoid_loss_grad.h" + +namespace phi { +namespace sr { + +static std::vector PathToRows(const DenseTensor& path) { + std::set rows; + const int64_t* paths = path.data(); + for (int64_t i = 0; i < path.numel(); ++i) { + int64_t row = paths[i]; + if (row < 0) { + continue; + } + rows.emplace(row); + } + return std::vector(rows.begin(), rows.end()); +} + +template +void HSigmoidLossGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + SelectedRows* w_grad, + DenseTensor* bias_grad) { + PADDLE_ENFORCE_NOT_NULL( + path.get_ptr(), + errors::NotFound("Custom tree must be set for sparse mode!")); + paddle::framework::Vector real_rows = PathToRows(*path); + w_grad->set_rows(real_rows); + // Build a map of id -> row_index to speed up finding the index of one id + w_grad->set_height(w.dims()[0]); + auto* w_grad_value = w_grad->mutable_value(); + phi::DDim temp_dim(w.dims()); + temp_dim[0] = real_rows.size(); + w_grad_value->Resize(temp_dim); + phi::HSigmoidLossGradKernelImpl(ctx, + x, + w, + label, + path, + code, + bias, + pre_out, + out_grad, + num_classes, + remote_prefetch, + trainer_id, + height_sections, + epmap, + table_names, + is_sparse, + x_grad, + w_grad_value, + bias_grad, + w_grad); +} + +} // 
namespace sr +} // namespace phi + +PD_REGISTER_KERNEL(hsigmoid_loss_grad_sr, + CPU, + ALL_LAYOUT, + phi::sr::HSigmoidLossGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h new file mode 100644 index 0000000000000..fe4ffe24601ae --- /dev/null +++ b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void HSigmoidLossGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + SelectedRows* w_grad, + DenseTensor* bias_grad); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/uniform_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_kernel.cc new file mode 100644 index 0000000000000..73d00aa9a796e --- /dev/null +++ b/paddle/phi/kernels/selected_rows/uniform_kernel.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
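PathToRows above relies on std::set to hand back unique, ascending row ids while skipping the -1 padding entries of the path tensor. A minimal equivalent over a plain vector (illustrative only, not part of the patch):

#include <cstdint>
#include <set>
#include <vector>

// e.g. path {3, -1, 5, 3} -> rows {3, 5}: deduplicated, sorted, -1 dropped.
std::vector<int64_t> UniqueRows(const std::vector<int64_t>& path) {
  std::set<int64_t> rows;
  for (int64_t r : path) {
    if (r >= 0) rows.insert(r);
  }
  return std::vector<int64_t>(rows.begin(), rows.end());
}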
diff --git a/paddle/phi/kernels/selected_rows/uniform_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_kernel.cc
new file mode 100644
index 0000000000000..73d00aa9a796e
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/uniform_kernel.cc
@@ -0,0 +1,96 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/selected_rows/uniform_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/uniform_kernel.h"
+
+namespace phi {
+namespace sr {
+
+template <typename T, typename Context>
+void UniformRawKernel(const Context& dev_ctx,
+                      const IntArray& shape,
+                      DataType dtype,
+                      const Scalar& min,
+                      const Scalar& max,
+                      int seed,
+                      int diag_num,
+                      int diag_step,
+                      float diag_val,
+                      SelectedRows* out) {
+  phi::UniformRawKernel<T>(dev_ctx,
+                           shape,
+                           dtype,
+                           min,
+                           max,
+                           seed,
+                           diag_num,
+                           diag_step,
+                           diag_val,
+                           out->mutable_value());
+}
+
+template <typename T, typename Context>
+void UniformKernel(const Context& dev_ctx,
+                   const IntArray& shape,
+                   DataType dtype,
+                   const Scalar& min,
+                   const Scalar& max,
+                   int seed,
+                   SelectedRows* out) {
+  phi::UniformKernel<T>(
+      dev_ctx, shape, dtype, min, max, seed, out->mutable_value());
+}
+
+}  // namespace sr
+}  // namespace phi
+
+PD_REGISTER_KERNEL(uniform_raw_sr,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::sr::UniformRawKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(uniform_sr,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::sr::UniformKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+PD_REGISTER_KERNEL(
+    uniform_raw_sr, GPU, ALL_LAYOUT, phi::sr::UniformRawKernel, float, double) {
+}
+
+PD_REGISTER_KERNEL(
+    uniform_sr, GPU, ALL_LAYOUT, phi::sr::UniformKernel, float, double) {}
+#endif
+
+#if defined(PADDLE_WITH_XPU)
+
+PD_REGISTER_KERNEL(
+    uniform_raw_sr, XPU, ALL_LAYOUT, phi::sr::UniformRawKernel, float) {}
+
+PD_REGISTER_KERNEL(uniform_sr, XPU, ALL_LAYOUT, phi::sr::UniformKernel, float) {
+}
+#endif
diff --git a/paddle/phi/kernels/selected_rows/uniform_random_kernel.h b/paddle/phi/kernels/selected_rows/uniform_kernel.h
similarity index 54%
rename from paddle/phi/kernels/selected_rows/uniform_random_kernel.h
rename to paddle/phi/kernels/selected_rows/uniform_kernel.h
index 237b01532c7bd..dc50175db84b6 100644
--- a/paddle/phi/kernels/selected_rows/uniform_random_kernel.h
+++ b/paddle/phi/kernels/selected_rows/uniform_kernel.h
@@ -22,25 +22,25 @@ namespace phi {
 namespace sr {
 
 template <typename T, typename Context>
-void UniformRandomRawKernel(const Context& dev_ctx,
-                            const IntArray& shape,
-                            DataType dtype,
-                            const Scalar& min,
-                            const Scalar& max,
-                            int seed,
-                            int diag_num,
-                            int diag_step,
-                            float diag_val,
-                            SelectedRows* out);
+void UniformRawKernel(const Context& dev_ctx,
+                      const IntArray& shape,
+                      DataType dtype,
+                      const Scalar& min,
+                      const Scalar& max,
+                      int seed,
+                      int diag_num,
+                      int diag_step,
+                      float diag_val,
+                      SelectedRows* out);
 
 template <typename T, typename Context>
-void UniformRandomKernel(const Context& dev_ctx,
-                         const IntArray& shape,
-                         DataType dtype,
-                         const Scalar& min,
-                         const Scalar& max,
-                         int seed,
-                         SelectedRows* out);
+void UniformKernel(const Context& dev_ctx,
+                   const IntArray& shape,
+                   DataType dtype,
+                   const Scalar& min,
+                   const Scalar& max,
+                   int seed,
+                   SelectedRows* out);
 
 }  // namespace sr
 }  // namespace phi
diff --git a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc
deleted file mode 100644
index d6037da45f69a..0000000000000
--- a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/phi/kernels/selected_rows/uniform_random_kernel.h"
-
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/uniform_random_kernel.h"
-
-namespace phi {
-namespace sr {
-
-template <typename T, typename Context>
-void UniformRandomRawKernel(const Context& dev_ctx,
-                            const IntArray& shape,
-                            DataType dtype,
-                            const Scalar& min,
-                            const Scalar& max,
-                            int seed,
-                            int diag_num,
-                            int diag_step,
-                            float diag_val,
-                            SelectedRows* out) {
-  phi::UniformRandomRawKernel<T>(dev_ctx,
-                                 shape,
-                                 dtype,
-                                 min,
-                                 max,
-                                 seed,
-                                 diag_num,
-                                 diag_step,
-                                 diag_val,
-                                 out->mutable_value());
-}
-
-template <typename T, typename Context>
-void UniformRandomKernel(const Context& dev_ctx,
-                         const IntArray& shape,
-                         DataType dtype,
-                         const Scalar& min,
-                         const Scalar& max,
-                         int seed,
-                         SelectedRows* out) {
-  phi::UniformRandomKernel<T>(
-      dev_ctx, shape, dtype, min, max, seed, out->mutable_value());
-}
-
-}  // namespace sr
-}  // namespace phi
-
-PD_REGISTER_KERNEL(uniform_random_raw_sr,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::sr::UniformRandomRawKernel,
-                   float,
-                   double,
-                   phi::dtype::bfloat16) {}
-
-PD_REGISTER_KERNEL(uniform_random_sr,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::sr::UniformRandomKernel,
-                   float,
-                   double,
-                   phi::dtype::bfloat16) {}
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-
-PD_REGISTER_KERNEL(uniform_random_raw_sr,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::sr::UniformRandomRawKernel,
-                   float,
-                   double) {}
-
-PD_REGISTER_KERNEL(uniform_random_sr,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::sr::UniformRandomKernel,
-                   float,
-                   double) {}
-#endif
-
-#if defined(PADDLE_WITH_XPU)
-
-PD_REGISTER_KERNEL(uniform_random_raw_sr,
-                   XPU,
-                   ALL_LAYOUT,
-                   phi::sr::UniformRandomRawKernel,
-                   float) {}
-
-PD_REGISTER_KERNEL(
-    uniform_random_sr, XPU, ALL_LAYOUT, phi::sr::UniformRandomKernel, float) {}
-#endif
diff --git a/paddle/phi/kernels/tril_triu_grad_kernel.h b/paddle/phi/kernels/tril_grad_kernel.h
similarity index 77%
rename from paddle/phi/kernels/tril_triu_grad_kernel.h
rename to paddle/phi/kernels/tril_grad_kernel.h
index 10faf5c48d5bf..7fc5e77363c62 100644
--- a/paddle/phi/kernels/tril_triu_grad_kernel.h
+++ b/paddle/phi/kernels/tril_grad_kernel.h
@@ -19,10 +19,10 @@
 namespace phi {
 
 template <typename T, typename Context>
-void TrilTriuGradKernel(const Context& ctx,
-                        const DenseTensor& out_grad,
-                        int diagonal,
-                        bool lower,
-                        DenseTensor* x_grad);
+void TrilGradKernel(const Context& ctx,
+                    const DenseTensor& out_grad,
+                    int diagonal,
+                    bool lower,
+                    DenseTensor* x_grad);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/tril_triu_kernel.h b/paddle/phi/kernels/tril_kernel.h
similarity index 66%
rename from paddle/phi/kernels/tril_triu_kernel.h
rename to paddle/phi/kernels/tril_kernel.h
index 8d4c44c5b35e2..52154c2b2bcb5 100644
--- a/paddle/phi/kernels/tril_triu_kernel.h
+++ b/paddle/phi/kernels/tril_kernel.h
@@ -20,21 +20,21 @@
 namespace phi {
 
 template <typename T, typename Context>
-void TrilTriuKernel(const Context& ctx,
-                    const DenseTensor& x,
-                    int diagonal,
-                    bool lower,
-                    DenseTensor* out);
+void TrilKernel(const Context& ctx,
+                const DenseTensor& x,
+                int diagonal,
+                bool lower,
+                DenseTensor* out);
 
 template <typename T, typename Context>
-DenseTensor
TrilTriu(const Context& ctx, - const DenseTensor& x, - int diagonal, - bool lower) { +DenseTensor Tril(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower) { DenseTensor dense_out; MetaTensor meta_out(&dense_out); - TrilTriuInferMeta(x, diagonal, lower, &meta_out); - TrilTriuKernel(ctx, x, diagonal, lower, &dense_out); + TrilInferMeta(x, diagonal, lower, &meta_out); + TrilKernel(ctx, x, diagonal, lower, &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/uniform_random_inplace_kernel.h b/paddle/phi/kernels/uniform_inplace_grad_kernel.h similarity index 63% rename from paddle/phi/kernels/uniform_random_inplace_kernel.h rename to paddle/phi/kernels/uniform_inplace_grad_kernel.h index 97a79375aff19..fd37f3c6f5d61 100644 --- a/paddle/phi/kernels/uniform_random_inplace_kernel.h +++ b/paddle/phi/kernels/uniform_inplace_grad_kernel.h @@ -19,14 +19,14 @@ limitations under the License. */ namespace phi { template -void UniformRandomInplaceKernel(const Context& ctx, - const DenseTensor& x, - float min, - float max, - int seed, - int diag_num, - int diag_step, - float diag_val, - DenseTensor* out); +void UniformInplaceGradKernel(const Context& ctx, + const DenseTensor& out_grad, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* x_grad); } // namespace phi diff --git a/paddle/phi/kernels/uniform_random_inplace_grad_kernel.h b/paddle/phi/kernels/uniform_inplace_kernel.h similarity index 60% rename from paddle/phi/kernels/uniform_random_inplace_grad_kernel.h rename to paddle/phi/kernels/uniform_inplace_kernel.h index ae74fbe2fd78c..9bb17b4a33773 100644 --- a/paddle/phi/kernels/uniform_random_inplace_grad_kernel.h +++ b/paddle/phi/kernels/uniform_inplace_kernel.h @@ -19,14 +19,14 @@ limitations under the License. */ namespace phi { template -void UniformRandomInplaceGradKernel(const Context& ctx, - const DenseTensor& out_grad, - float min, - float max, - int seed, - int diag_num, - int diag_step, - float diag_val, - DenseTensor* x_grad); +void UniformInplaceKernel(const Context& ctx, + const DenseTensor& x, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/uniform_random_kernel.cc b/paddle/phi/kernels/uniform_kernel.cc similarity index 68% rename from paddle/phi/kernels/uniform_random_kernel.cc rename to paddle/phi/kernels/uniform_kernel.cc index 6669438cc3b7b..3744fc49d77a0 100644 --- a/paddle/phi/kernels/uniform_random_kernel.cc +++ b/paddle/phi/kernels/uniform_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/kernels/uniform_kernel.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" @@ -29,38 +29,36 @@ namespace phi { template -void UniformRandomKernel(const Context& dev_ctx, - const IntArray& shape, - DataType dtype, - const Scalar& min, - const Scalar& max, - int seed, - DenseTensor* out) { - UniformRandomRawKernel( - dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out); +void UniformKernel(const Context& dev_ctx, + const IntArray& shape, + DataType dtype, + const Scalar& min, + const Scalar& max, + int seed, + DenseTensor* out) { + UniformRawKernel(dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out); } } // namespace phi -PD_REGISTER_KERNEL(uniform_random, +PD_REGISTER_KERNEL(uniform, CPU, ALL_LAYOUT, - phi::UniformRandomKernel, + phi::UniformKernel, float, double, phi::dtype::bfloat16) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(uniform_random, +PD_REGISTER_KERNEL(uniform, GPU, ALL_LAYOUT, - phi::UniformRandomKernel, + phi::UniformKernel, float, double, phi::dtype::float16) {} #endif #ifdef PADDLE_WITH_XPU -PD_REGISTER_KERNEL( - uniform_random, XPU, ALL_LAYOUT, phi::UniformRandomKernel, float) {} +PD_REGISTER_KERNEL(uniform, XPU, ALL_LAYOUT, phi::UniformKernel, float) {} #endif diff --git a/paddle/phi/kernels/uniform_random_kernel.h b/paddle/phi/kernels/uniform_kernel.h similarity index 54% rename from paddle/phi/kernels/uniform_random_kernel.h rename to paddle/phi/kernels/uniform_kernel.h index 1395a4663b914..ef19c20b93284 100644 --- a/paddle/phi/kernels/uniform_random_kernel.h +++ b/paddle/phi/kernels/uniform_kernel.h @@ -22,24 +22,24 @@ namespace phi { template -void UniformRandomRawKernel(const Context& dev_ctx, - const IntArray& shape, - DataType dtype, - const Scalar& min, - const Scalar& max, - int seed, - int diag_num, - int diag_step, - float diag_val, - DenseTensor* out); +void UniformRawKernel(const Context& dev_ctx, + const IntArray& shape, + DataType dtype, + const Scalar& min, + const Scalar& max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out); template -void UniformRandomKernel(const Context& dev_ctx, - const IntArray& shape, - DataType dtype, - const Scalar& min, - const Scalar& max, - int seed, - DenseTensor* out); +void UniformKernel(const Context& dev_ctx, + const IntArray& shape, + DataType dtype, + const Scalar& min, + const Scalar& max, + int seed, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/xpu/where_index_kernel.cc b/paddle/phi/kernels/xpu/nonzero_kernel.cc similarity index 89% rename from paddle/phi/kernels/xpu/where_index_kernel.cc rename to paddle/phi/kernels/xpu/nonzero_kernel.cc index f6653e57f6ead..cf936f659f3fc 100644 --- a/paddle/phi/kernels/xpu/where_index_kernel.cc +++ b/paddle/phi/kernels/xpu/nonzero_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/where_index_kernel.h" +#include "paddle/phi/kernels/nonzero_kernel.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" @@ -22,9 +22,9 @@ namespace phi { template -void WhereIndexKernel(const Context& dev_ctx, - const DenseTensor& condition, - DenseTensor* out) { +void NonZeroKernel(const Context& dev_ctx, + const DenseTensor& condition, + DenseTensor* out) { const T* cond_data = condition.data(); auto numel = condition.numel(); auto dims = condition.dims(); @@ -69,4 +69,4 @@ void WhereIndexKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - where_index, XPU, ALL_LAYOUT, phi::WhereIndexKernel, int, bool, float) {} + nonzero, XPU, ALL_LAYOUT, phi::NonZeroKernel, int, bool, float) {} diff --git a/paddle/phi/kernels/xpu/reduce_prod_kernel.cc b/paddle/phi/kernels/xpu/prod_kernel.cc similarity index 96% rename from paddle/phi/kernels/xpu/reduce_prod_kernel.cc rename to paddle/phi/kernels/xpu/prod_kernel.cc index c82dd1b5f667d..7be48a8bab774 100644 --- a/paddle/phi/kernels/xpu/reduce_prod_kernel.cc +++ b/paddle/phi/kernels/xpu/prod_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/kernels/prod_kernel.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/xpu_context.h" diff --git a/paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc b/paddle/phi/kernels/xpu/tril_grad_kernel.cc similarity index 81% rename from paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc rename to paddle/phi/kernels/xpu/tril_grad_kernel.cc index 964e9c6174235..af8dfdd8c0bad 100644 --- a/paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/tril_grad_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/tril_triu_grad_kernel.h" +#include "paddle/phi/kernels/tril_grad_kernel.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/core/kernel_registry.h" @@ -20,11 +20,11 @@ namespace phi { template -void TrilTriuGradKernel(const Context& ctx, - const DenseTensor& out_grad, - int diagonal, - bool lower, - DenseTensor* x_grad) { +void TrilGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad) { using XPUType = typename XPUTypeTrait::Type; ctx.template Alloc(x_grad); auto dy_shape = vectorize(out_grad.dims()); @@ -49,4 +49,4 @@ void TrilTriuGradKernel(const Context& ctx, } // namespace phi PD_REGISTER_KERNEL( - tril_triu_grad, XPU, ALL_LAYOUT, phi::TrilTriuGradKernel, int, float) {} + tril_grad, XPU, ALL_LAYOUT, phi::TrilGradKernel, int, float) {} diff --git a/paddle/phi/kernels/xpu/tril_triu_kernel.cc b/paddle/phi/kernels/xpu/tril_kernel.cc similarity index 82% rename from paddle/phi/kernels/xpu/tril_triu_kernel.cc rename to paddle/phi/kernels/xpu/tril_kernel.cc index 3d9ae98a23857..4b4cf579c26c6 100644 --- a/paddle/phi/kernels/xpu/tril_triu_kernel.cc +++ b/paddle/phi/kernels/xpu/tril_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/tril_triu_kernel.h" +#include "paddle/phi/kernels/tril_kernel.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/core/kernel_registry.h" @@ -20,11 +20,11 @@ namespace phi { template -void TrilTriuKernel(const Context& ctx, - const DenseTensor& x, - int diagonal, - bool lower, - DenseTensor* out) { +void TrilKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; ctx.template Alloc(out); auto xshape = vectorize(x.dims()); @@ -48,5 +48,4 @@ void TrilTriuKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL( - tril_triu, XPU, ALL_LAYOUT, phi::TrilTriuKernel, int, float) {} +PD_REGISTER_KERNEL(tril, XPU, ALL_LAYOUT, phi::TrilKernel, int, float) {} diff --git a/paddle/phi/kernels/xpu/uniform_random_kernel.cc b/paddle/phi/kernels/xpu/uniform_kernel.cc similarity index 77% rename from paddle/phi/kernels/xpu/uniform_random_kernel.cc rename to paddle/phi/kernels/xpu/uniform_kernel.cc index 48384164e7668..f4732939b6dbf 100644 --- a/paddle/phi/kernels/xpu/uniform_random_kernel.cc +++ b/paddle/phi/kernels/xpu/uniform_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/kernels/uniform_kernel.h" #include @@ -24,16 +24,16 @@ limitations under the License. */ namespace phi { template -void UniformRandomRawKernel(const Context &dev_ctx, - const IntArray &shape, - DataType dtype, - const Scalar &min, - const Scalar &max, - int seed, - int diag_num, - int diag_step, - float diag_val, - DenseTensor *out) { +void UniformRawKernel(const Context &dev_ctx, + const IntArray &shape, + DataType dtype, + const Scalar &min, + const Scalar &max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor *out) { out->Resize(phi::make_ddim(shape.GetData())); T *data = dev_ctx.template Alloc(out); int64_t size = out->numel(); @@ -76,5 +76,5 @@ void UniformRandomRawKernel(const Context &dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - uniform_random_raw, XPU, ALL_LAYOUT, phi::UniformRandomRawKernel, float) {} +PD_REGISTER_KERNEL(uniform_raw, XPU, ALL_LAYOUT, phi::UniformRawKernel, float) { +} diff --git a/paddle/phi/kernels/yolo_loss_grad_kernel.h b/paddle/phi/kernels/yolo_loss_grad_kernel.h new file mode 100644 index 0000000000000..905c1ab4efb0b --- /dev/null +++ b/paddle/phi/kernels/yolo_loss_grad_kernel.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void YoloLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& gt_box, + const DenseTensor& gt_label, + const paddle::optional& gt_score, + const DenseTensor& objectness_mask, + const DenseTensor& gt_match_mask, + const DenseTensor& loss_grad, + const std::vector& anchors, + const std::vector& anchor_mask, + int class_num, + float ignore_thresh, + int downsample_ratio, + bool use_label_smooth, + float scale_x_Y, + DenseTensor* x_grad, + DenseTensor* gt_box_grad, + DenseTensor* gt_label_grad, + DenseTensor* gt_score_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/yolo_loss_kernel.h b/paddle/phi/kernels/yolo_loss_kernel.h new file mode 100644 index 0000000000000..17735fe756f4d --- /dev/null +++ b/paddle/phi/kernels/yolo_loss_kernel.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void YoloLossKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& gt_box, + const DenseTensor& gt_label, + const paddle::optional& gt_score, + const std::vector& anchors, + const std::vector& anchor_mask, + int class_num, + float ignore_thresh, + int downsample_ratio, + bool use_label_smooth, + float scale_x_Y, + DenseTensor* loss, + DenseTensor* objectness_mask, + DenseTensor* gt_match_mask); + +} // namespace phi diff --git a/paddle/phi/kernels/yolov3_loss_grad_kernel.h b/paddle/phi/kernels/yolov3_loss_grad_kernel.h deleted file mode 100644 index 4d0be5bebb6f9..0000000000000 --- a/paddle/phi/kernels/yolov3_loss_grad_kernel.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void Yolov3LossGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& gt_box, - const DenseTensor& gt_label, - const paddle::optional& gt_score, - const DenseTensor& objectness_mask, - const DenseTensor& gt_match_mask, - const DenseTensor& loss_grad, - const std::vector& anchors, - const std::vector& anchor_mask, - int class_num, - float ignore_thresh, - int downsample_ratio, - bool use_label_smooth, - float scale_x_Y, - DenseTensor* x_grad, - DenseTensor* gt_box_grad, - DenseTensor* gt_label_grad, - DenseTensor* gt_score_grad); - -} // namespace phi diff --git a/paddle/phi/kernels/yolov3_loss_kernel.h b/paddle/phi/kernels/yolov3_loss_kernel.h deleted file mode 100644 index 3dabe5ce820ee..0000000000000 --- a/paddle/phi/kernels/yolov3_loss_kernel.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void Yolov3LossKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& gt_box, - const DenseTensor& gt_label, - const paddle::optional& gt_score, - const std::vector& anchors, - const std::vector& anchor_mask, - int class_num, - float ignore_thresh, - int downsample_ratio, - bool use_label_smooth, - float scale_x_Y, - DenseTensor* loss, - DenseTensor* objectness_mask, - DenseTensor* gt_match_mask); - -} // namespace phi diff --git a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc index 5393439901b91..c4e04e5d40b02 100644 --- a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc +++ b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc @@ -18,7 +18,7 @@ namespace phi { KernelSignature HierarchicalSigmoidOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("hierarchical_sigmoid", + return KernelSignature("hsigmoid_loss", {"X", "W", "Label", "PathTable", "PathCode", "Bias"}, {"num_classes", "remote_prefetch", @@ -33,7 +33,7 @@ KernelSignature HierarchicalSigmoidOpArgumentMapping( KernelSignature HierarchicalSigmoidGradOpArgumentMapping( const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorOutput("W@GRAD")) { - return KernelSignature("hierarchical_sigmoid_grad", + return KernelSignature("hsigmoid_loss_grad", {"X", "W", "Label", @@ -51,7 +51,7 @@ KernelSignature HierarchicalSigmoidGradOpArgumentMapping( "is_sparse"}, {"X@GRAD", "W@GRAD", "Bias@GRAD"}); } else if (ctx.IsSelectedRowsOutput("W@GRAD")) { - return KernelSignature("hierarchical_sigmoid_grad_sr", + return KernelSignature("hsigmoid_loss_grad_sr", {"X", "W", "Label", @@ -75,6 +75,9 @@ KernelSignature HierarchicalSigmoidGradOpArgumentMapping( } // namespace phi +PD_REGISTER_BASE_KERNEL_NAME(hierarchical_sigmoid, hsigmoid_loss); +PD_REGISTER_BASE_KERNEL_NAME(hierarchical_sigmoid_grad, hsigmoid_loss_grad); + 
PD_REGISTER_ARG_MAPPING_FN(hierarchical_sigmoid, phi::HierarchicalSigmoidOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(hierarchical_sigmoid_grad, diff --git a/paddle/phi/ops/compat/tril_triu_sig.cc b/paddle/phi/ops/compat/tril_triu_sig.cc index 3c5fa15b41cae..3cf022c60e3da 100644 --- a/paddle/phi/ops/compat/tril_triu_sig.cc +++ b/paddle/phi/ops/compat/tril_triu_sig.cc @@ -17,16 +17,19 @@ limitations under the License. */ namespace phi { KernelSignature TrilTriuOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("tril_triu", {"X"}, {"diagonal", "lower"}, {"Out"}); + return KernelSignature("tril", {"X"}, {"diagonal", "lower"}, {"Out"}); } KernelSignature TrilTriuGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "tril_triu_grad", {"Out@GRAD"}, {"diagonal", "lower"}, {"X@GRAD"}); + "tril_grad", {"Out@GRAD"}, {"diagonal", "lower"}, {"X@GRAD"}); } } // namespace phi +PD_REGISTER_BASE_KERNEL_NAME(tril_triu, tril); +PD_REGISTER_BASE_KERNEL_NAME(tril_triu_grad, tril_grad); + PD_REGISTER_ARG_MAPPING_FN(tril_triu, phi::TrilTriuOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tril_triu_grad, phi::TrilTriuGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/uniform_random_inplace_sig.cc b/paddle/phi/ops/compat/uniform_random_inplace_sig.cc index afdc0d5f3b3e3..ae955e9ca19bc 100644 --- a/paddle/phi/ops/compat/uniform_random_inplace_sig.cc +++ b/paddle/phi/ops/compat/uniform_random_inplace_sig.cc @@ -18,7 +18,7 @@ namespace phi { KernelSignature UniformRandomInplaceOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "uniform_random_inplace", + "uniform_inplace", {"X"}, {"min", "max", "seed", "diag_num", "diag_step", "diag_val"}, {"Out"}); @@ -27,7 +27,7 @@ KernelSignature UniformRandomInplaceOpArgumentMapping( KernelSignature UniformRandomInplaceGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "uniform_random_inplace_grad", + "uniform_inplace_grad", {"Out@GRAD"}, {"min", "max", "seed", "diag_num", "diag_step", "diag_val"}, {"X@GRAD"}); @@ -35,6 +35,8 @@ KernelSignature UniformRandomInplaceGradOpArgumentMapping( } // namespace phi +PD_REGISTER_BASE_KERNEL_NAME(uniform_random_inplace, uniform_inplace); + PD_REGISTER_ARG_MAPPING_FN(uniform_random_inplace, phi::UniformRandomInplaceOpArgumentMapping); diff --git a/paddle/phi/ops/compat/uniform_random_sig.cc b/paddle/phi/ops/compat/uniform_random_sig.cc index d06d4026f4f5f..a6d0b185c6456 100644 --- a/paddle/phi/ops/compat/uniform_random_sig.cc +++ b/paddle/phi/ops/compat/uniform_random_sig.cc @@ -22,7 +22,7 @@ KernelSignature UniformRandomOpArgumentMapping( if (ctx.IsDenseTensorOutput("Out")) { if (diag_num) { if (ctx.InputSize("ShapeTensorList") > 0) { - return KernelSignature("uniform_random_raw", + return KernelSignature("uniform_raw", {}, {"ShapeTensorList", "dtype", @@ -37,7 +37,7 @@ KernelSignature UniformRandomOpArgumentMapping( const auto& shape = paddle::any_cast>(ctx.Attr("shape")); if (ctx.HasInput("ShapeTensor") && shape.empty()) { - return KernelSignature("uniform_random_raw", + return KernelSignature("uniform_raw", {}, {"ShapeTensor", "dtype", @@ -49,7 +49,7 @@ KernelSignature UniformRandomOpArgumentMapping( "diag_val"}, {"Out"}); } else { - return KernelSignature("uniform_random_raw", + return KernelSignature("uniform_raw", {}, {"shape", "dtype", @@ -65,7 +65,7 @@ KernelSignature UniformRandomOpArgumentMapping( } else { if (ctx.InputSize("ShapeTensorList") > 0) { return KernelSignature( - "uniform_random", + 
"uniform", {}, {"ShapeTensorList", "dtype", "min", "max", "seed"}, {"Out"}); @@ -73,22 +73,20 @@ KernelSignature UniformRandomOpArgumentMapping( const auto& shape = paddle::any_cast>(ctx.Attr("shape")); if (ctx.HasInput("ShapeTensor") && shape.empty()) { - return KernelSignature("uniform_random", + return KernelSignature("uniform", {}, {"ShapeTensor", "dtype", "min", "max", "seed"}, {"Out"}); } else { - return KernelSignature("uniform_random", - {}, - {"shape", "dtype", "min", "max", "seed"}, - {"Out"}); + return KernelSignature( + "uniform", {}, {"shape", "dtype", "min", "max", "seed"}, {"Out"}); } } } } else if (ctx.IsSelectedRowsOutput("Out")) { if (diag_num) { if (ctx.InputSize("ShapeTensorList") > 0) { - return KernelSignature("uniform_random_raw_sr", + return KernelSignature("uniform_raw_sr", {}, {"ShapeTensorList", "dtype", @@ -103,7 +101,7 @@ KernelSignature UniformRandomOpArgumentMapping( const auto& shape = paddle::any_cast>(ctx.Attr("shape")); if (ctx.HasInput("ShapeTensor") && shape.empty()) { - return KernelSignature("uniform_random_raw_sr", + return KernelSignature("uniform_raw_sr", {}, {"ShapeTensor", "dtype", @@ -115,7 +113,7 @@ KernelSignature UniformRandomOpArgumentMapping( "diag_val"}, {"Out"}); } else { - return KernelSignature("uniform_random_raw_sr", + return KernelSignature("uniform_raw_sr", {}, {"shape", "dtype", @@ -131,7 +129,7 @@ KernelSignature UniformRandomOpArgumentMapping( } else { if (ctx.InputSize("ShapeTensorList") > 0) { return KernelSignature( - "uniform_random_sr", + "uniform_sr", {}, {"ShapeTensorList", "dtype", "min", "max", "seed"}, {"Out"}); @@ -139,12 +137,12 @@ KernelSignature UniformRandomOpArgumentMapping( const auto& shape = paddle::any_cast>(ctx.Attr("shape")); if (ctx.HasInput("ShapeTensor") && shape.empty()) { - return KernelSignature("uniform_random_sr", + return KernelSignature("uniform_sr", {}, {"ShapeTensor", "dtype", "min", "max", "seed"}, {"Out"}); } else { - return KernelSignature("uniform_random_sr", + return KernelSignature("uniform_sr", {}, {"shape", "dtype", "min", "max", "seed"}, {"Out"}); @@ -156,4 +154,6 @@ KernelSignature UniformRandomOpArgumentMapping( } } // namespace phi +PD_REGISTER_BASE_KERNEL_NAME(uniform_random, uniform); + PD_REGISTER_ARG_MAPPING_FN(uniform_random, phi::UniformRandomOpArgumentMapping); diff --git a/paddle/phi/ops/compat/where_index_sig.cc b/paddle/phi/ops/compat/where_index_sig.cc new file mode 100644 index 0000000000000..cfe2a8110cc84 --- /dev/null +++ b/paddle/phi/ops/compat/where_index_sig.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature WhereIndexOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("nonzero", {"Condition"}, {}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(where_index, nonzero); + +PD_REGISTER_ARG_MAPPING_FN(where_index, phi::WhereIndexOpArgumentMapping); diff --git a/paddle/phi/ops/compat/yolov3_loss_sig.cc b/paddle/phi/ops/compat/yolov3_loss_sig.cc index cdd3ace81aa73..f98709a9fdf33 100644 --- a/paddle/phi/ops/compat/yolov3_loss_sig.cc +++ b/paddle/phi/ops/compat/yolov3_loss_sig.cc @@ -17,7 +17,7 @@ namespace phi { KernelSignature Yolov3LossOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("yolov3_loss", + return KernelSignature("yolo_loss", {"X", "GTBox", "GTLabel", "GTScore"}, {"anchors", "anchor_mask", @@ -32,7 +32,7 @@ KernelSignature Yolov3LossOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature Yolov3LossGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "yolov3_loss_grad", + "yolo_loss_grad", {"X", "GTBox", "GTLabel", @@ -51,6 +51,9 @@ KernelSignature Yolov3LossGradOpArgumentMapping( } } // namespace phi +PD_REGISTER_BASE_KERNEL_NAME(yolov3_loss, yolo_loss); +PD_REGISTER_BASE_KERNEL_NAME(yolov3_loss_grad, yolo_loss_grad); + PD_REGISTER_ARG_MAPPING_FN(yolov3_loss, phi::Yolov3LossOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(yolov3_loss_grad, phi::Yolov3LossGradOpArgumentMapping); diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 5cf54f221362b..052564e7870db 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -309,7 +309,7 @@ def forward(self, var, block=None): if framework._non_static_mode(): if in_dygraph_mode(): - out_var = _C_ops.uniform_random( + out_var = _C_ops.uniform( var.shape, out_dtype, self._low, @@ -711,7 +711,7 @@ def forward(self, var, block=None): if self._uniform: limit = math.sqrt(6.0 / float(fan_in + fan_out)) if in_dygraph_mode(): - out_var = _C_ops.uniform_random( + out_var = _C_ops.uniform( out_var.shape, out_dtype, -limit, @@ -923,7 +923,7 @@ def forward(self, var, block=None): gain = calculate_gain(self._nonlinearity, self._negative_slope) limit = gain * math.sqrt(3.0 / float(fan_in)) if in_dygraph_mode(): - out_var = _C_ops.uniform_random( + out_var = _C_ops.uniform( var.shape, out_dtype, -limit, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 69c00b5dbb1f2..b1a49e23cd7bb 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -5385,7 +5385,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): ) ) if in_dygraph_mode(): - return _C_ops.reduce_prod( + return _C_ops.prod( input, dim if dim != None and dim != [] else [0], keep_dim, @@ -15548,7 +15548,7 @@ def where(condition): """ if in_dygraph_mode(): - return _C_ops.where_index(condition) + return _C_ops.nonzero(condition) if _in_legacy_dygraph(): return _legacy_C_ops.where_index(condition) @@ -16567,7 +16567,7 @@ def uniform_random( if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) - return _C_ops.uniform_random( + return _C_ops.uniform( shape, dtype, float(min), diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index b7e1045b6ee35..8a1c5b9f0b84b 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1017,7 +1017,7 @@ def hsigmoid_loss( # [1.92374969]] """ if 
in_dygraph_mode(): - out, _, _ = _C_ops.hierarchical_sigmoid( + out, _, _ = _C_ops.hsigmoid_loss( input, weight, label, diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 8f44f3ffe162e..32f63c6f236a2 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1097,7 +1097,7 @@ def tril(x, diagonal=0, name=None): # [9 , 10, 0 , 0 ]]) """ if in_dygraph_mode(): - return _C_ops.tril_triu(x, diagonal, True) + return _C_ops.tril(x, diagonal, True) if _in_legacy_dygraph(): op = getattr(_legacy_C_ops, 'tril_triu') @@ -1163,7 +1163,7 @@ def triu(x, diagonal=0, name=None): """ if in_dygraph_mode(): - return _C_ops.tril_triu(x, diagonal, False) + return _C_ops.tril(x, diagonal, False) if _in_legacy_dygraph(): op = getattr(_legacy_C_ops, 'tril_triu') diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index f367215cd1d3d..f34851fdccade 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3706,7 +3706,7 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): dim = [0] if in_dygraph_mode(): - return _C_ops.reduce_prod(x, dim, keepdim, reduce_all) + return _C_ops.prod(x, dim, keepdim, reduce_all) if _in_legacy_dygraph(): return _legacy_C_ops.reduce_prod( x, 'dim', dim, 'keep_dim', keepdim, 'reduce_all', reduce_all diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index f5f448cf4ef82..d49941e199bae 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -584,7 +584,7 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) - return _C_ops.uniform_random( + return _C_ops.uniform( shape, dtype, float(min), @@ -664,7 +664,7 @@ def uniform_(x, min=-1.0, max=1.0, seed=0, name=None): # [ 0.433519, 0.39483607, -0.8660099, 0.83664286]] # random """ if in_dygraph_mode(): - return _C_ops.uniform_random_inplace_(x, min, max, seed, 0, 0, 1.0) + return _C_ops.uniform_inplace_(x, min, max, seed, 0, 0, 1.0) else: return _legacy_C_ops.uniform_random_inplace_( x, 'min', min, 'max', max, 'seed', seed diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 165809b39fb62..355bc63f037b6 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -429,7 +429,7 @@ def nonzero(x, as_tuple=False): rank = len(shape) if in_dygraph_mode(): - outs = _C_ops.where_index(x) + outs = _C_ops.nonzero(x) elif paddle.in_dynamic_mode(): outs = _legacy_C_ops.where_index(x) else: diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 519ac1db4c681..1e8fc049efda7 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -197,7 +197,7 @@ def yolo_loss( """ if in_dygraph_mode(): - loss, _, _ = _C_ops.yolov3_loss( + loss, _, _ = _C_ops.yolo_loss( x, gt_box, gt_label, From ec7fe8886695f5b5f759f0a6f29848320452f7ee Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Tue, 1 Nov 2022 18:57:36 +0800 Subject: [PATCH 65/91] Fix bugs in tranpose kernel (#47212) * first commit * transpose_kernel_optimization * first complishment of transpose op * second commit * refine code logics of tranpose_kernel * refine transpose kernel * first commit * fix DtoD copy bugs for hip * refine code according to the PR advice * change dim to int64_t type. 
* fix some type error --- paddle/fluid/operators/transpose_op.cu.h | 200 +++++++++---------- paddle/fluid/operators/transpose_op.h | 108 ++++------ paddle/phi/kernels/autotune/auto_tune_base.h | 2 +- 3 files changed, 142 insertions(+), 168 deletions(-) diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index 0e1906bedf7b8..eb9e8a7bed784 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/autotune/auto_tune_base.h" @@ -832,15 +831,6 @@ class IdxAndOffsetHelper { index_helper = IdxHelper(dims); } - template - explicit IdxAndOffsetHelper(const U* dims) { - T temp_dims[N]; - for (int i = 0; i < N; ++i) { - temp_dims[i] = static_cast(dims[i]); - } - index_helper = IdxHelper(temp_dims); - } - __device__ inline T IndexToOffset(const T* index) const { T offset = 0; #pragma unroll @@ -866,15 +856,17 @@ struct PermuteParams { IdxAndOffsetHelper dst_index_helper; int perm[Rank]{}; - explicit PermuteParams(const std::vector& dims, + explicit PermuteParams(const std::vector& dims, const std::vector& perm_) { - size_t dst_dims[Rank]; - for (size_t i = 0; i < Rank; ++i) { + IndexT dst_dims[Rank]; + IndexT src_dims[Rank]; + for (auto i = 0; i < Rank; ++i) { + src_dims[i] = dims[i]; dst_dims[i] = dims[perm_[i]]; perm[i] = perm_[i]; } dst_index_helper = IdxAndOffsetHelper(dst_dims); - src_index_helper = IdxAndOffsetHelper(dims.data()); + src_index_helper = IdxAndOffsetHelper(src_dims); } }; @@ -966,21 +958,26 @@ template inline void LaunchPermuteKernel(const phi::GPUContext& ctx, const IndexT count, const PermuteType perm_type, - const std::vector& dims, + const std::vector& dims, const std::vector& perm, const T* src, T* dst) { size_t main_count = count / VecSize; - auto params = PermuteParams(dims, perm); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, main_count); - if (perm_type == PermuteType::kNormalPermute) { + if (perm_type == PermuteType::kGeneralPermute) { size_t tail_count = count - main_count * VecSize; size_t offset = count - tail_count; + auto params = PermuteParams(dims, perm); + GeneralPermuteKernel <<>>( params, src, dst, main_count, tail_count, offset); } else { + std::vector vec_dims(dims); + vec_dims[dims.size() - 1] /= VecSize; + auto params = PermuteParams(vec_dims, perm); + VectorizedPermuteKernel <<>>( params, main_count, src, dst); @@ -991,7 +988,7 @@ template inline void LaunchPermuteRankDispatch(const phi::GPUContext& ctx, const IndexT count, const PermuteType perm_type, - const std::vector& dims, + const std::vector& dims, const std::vector& perm, const T* src, T* dst) { @@ -1016,70 +1013,76 @@ inline void LaunchPermuteRankDispatch(const phi::GPUContext& ctx, #undef CALL_DISPATCH_RANK } -// Aim at transposing the last 2 dimensions. Refer from +// Aim at transposing the last 2 dimensions. 
Reference from // https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/ template __global__ void BatchTransposeKernel(const T* __restrict__ src_data, T* dst_data, IndexT rows, - IndexT cols) { + IndexT cols, + IndexT round_tile_rows, + IndexT round_tile_cols) { using VecT = phi::AlignedVector; - - __shared__ VecT tile[kTileSize][kShareCol]; - T* single_tile = reinterpret_cast(tile); - - IndexT col_in_matrix = blockIdx.x * kTileSize + threadIdx.x; - IndexT offset = blockIdx.z * rows * cols; + constexpr int kShareCol = kTileSize + 1; + __shared__ VecT v_shared[kTileSize * kShareCol]; + T* s_shared = reinterpret_cast(v_shared); // Vectorized load data from src into shared memory. [rows, cols] - const VecT* __restrict__ src = + const VecT* __restrict__ vec_src = reinterpret_cast(src_data); - for (IndexT tile_y = threadIdx.y; tile_y < kTileSize; tile_y += kBlockRows) { - IndexT row_in_matrix = tile_y + blockIdx.y * kTileSize; + IndexT col_in_matrix = blockIdx.x * kTileSize + threadIdx.x; + IndexT offset = blockIdx.z * rows * cols; - if (col_in_matrix < cols && row_in_matrix < rows) { - tile[tile_y][threadIdx.x] = - src[offset + row_in_matrix * cols + col_in_matrix]; + if (col_in_matrix < cols) { + int row_range = (blockIdx.y < round_tile_rows) + ? kTileSize + : (rows - kTileSize * round_tile_rows); +#pragma unroll + for (int tile_y = threadIdx.y; tile_y < row_range; tile_y += kBlockRows) { + IndexT row_in_matrix = tile_y + blockIdx.y * kTileSize; + v_shared[tile_y * kShareCol + threadIdx.x] = + vec_src[offset + row_in_matrix * cols + col_in_matrix]; } } - // Singularized load data from shared memory into dst. - // and dst_cols = rows, dst_rows = cols, [cols * Vecsize, rows] + // Write data from shared memory into dst and + // dst_cols = rows, dst_rows = cols * Vecsize col_in_matrix = blockIdx.y * kTileSize + threadIdx.x; offset = offset * VecSize + col_in_matrix; - IndexT tile_x_idx = threadIdx.x * (kShareCol * VecSize); - __syncthreads(); - for (IndexT tile_y = threadIdx.y; tile_y < kTileSize; tile_y += kBlockRows) { - IndexT row_in_matrix = tile_y + blockIdx.x * kTileSize; - IndexT dst_idx = offset + row_in_matrix * VecSize * rows; - IndexT tile_idx = tile_x_idx + tile_y * VecSize; - if (col_in_matrix < /*dst_cols=*/rows && - row_in_matrix < /*dst_rows=*/cols) { + if (col_in_matrix < /*dst_cols=*/rows) { + int col_range = (blockIdx.x < round_tile_cols) + ? kTileSize + : (cols - kTileSize * round_tile_cols); #pragma unroll - for (auto i = 0; i < VecSize; ++i) { - dst_data[dst_idx + i * rows] = single_tile[tile_idx + i]; + for (IndexT tile_y = threadIdx.y; tile_y < col_range; + tile_y += kBlockRows) { +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + IndexT row_in_matrix = (tile_y + blockIdx.x * kTileSize) * VecSize + i; + IndexT shared_idx = (tile_y + threadIdx.x * kShareCol) * VecSize + i; + dst_data[offset + row_in_matrix * rows] = s_shared[shared_idx]; } } } } -// With the byte limitation of shared_memory, the VecSize shall be restricted -// for the type whose byte-size is less than 8. +// With the byte limitation of shared_memory, the VecSize shall be +// restricted for the type whose byte-size is less than 4. template 8 ? 1 : Size)> + int VecSize = (sizeof(T) > 4 ? 1 : Size)> inline void LaunchTransposeKernel(const phi::GPUContext& ctx, - const std::vector& dims, + const std::vector& dims, const T* src, T* dst) { auto rank = dims.size(); IndexT num_batches = (rank == 2) ? 
1 : dims[0]; IndexT rows = dims[rank - 2]; - IndexT cols = dims[rank - 1]; + IndexT cols = dims[rank - 1] / VecSize; IndexT num_tile_rows = (rows + kTileSize - 1) / kTileSize; IndexT num_tile_cols = (cols + kTileSize - 1) / kTileSize; @@ -1087,14 +1090,15 @@ inline void LaunchTransposeKernel(const phi::GPUContext& ctx, dim3 threads(kTileSize, kBlockRows, 1); BatchTransposeKernel - <<>>(src, dst, rows, cols); + <<>>( + src, dst, rows, cols, num_tile_rows - 1, num_tile_cols - 1); } template inline void LaunchWithDispatchVecSize(const phi::GPUContext& ctx, const int vec_size, const PermuteType perm_type, - const std::vector& dims, + const std::vector& dims, const std::vector& perm, const T* src, T* dst, @@ -1123,60 +1127,50 @@ inline void LaunchWithDispatchVecSize(const phi::GPUContext& ctx, #undef CALL_DISPATCH_VEC_SIZE } -template -inline void LaunchWithDispatchIndex(const phi::GPUContext& ctx, - const size_t count, - const int vec_size, - const PermuteType perm_type, - const std::vector& dims, - const std::vector& perm, - const T* src, - T* dst) { - if (count < std::numeric_limits::max()) { - LaunchWithDispatchVecSize(ctx, - vec_size, - perm_type, - dims, - perm, - src, - dst, - static_cast(count)); - } else { - int64_t cnt = static_cast(count); - LaunchWithDispatchVecSize(ctx, - vec_size, - perm_type, - dims, - perm, - src, - dst, - static_cast(count)); - } -} - template -inline void SimplifyThenLaunch(const int rank, - const DeviceContext& ctx, - const phi::DenseTensor& in, - phi::DenseTensor* out, - const std::vector& perm) { - int sm_count = ctx.GetSMCount(); - auto src_dims = phi::vectorize(in.dims()); - auto simplifier = DimsSimplifier( - sm_count, rank, perm, src_dims, in.data(), out->data()); - - if (simplifier.GetPermType() == PermuteType::kCopy) { +inline void PermuteAndTranspose(const int rank, + const DeviceContext& ctx, + const phi::DenseTensor& in, + phi::DenseTensor* out, + const std::vector& perm) { + const int64_t numel = in.numel(); + auto classifier = + TranposeTypeClassifier(ctx.GetSMCount(), + rank, + numel, + perm, + phi::vectorize(in.dims()), + in.data(), + out->data()); + + if (classifier.GetPermType() == PermuteType::kCopy) { // If perm is [0,1,2,3], then just operate a DtoD copy. 
- phi::Copy(ctx, in, ctx.GetPlace(), false, out); + phi::backends::gpu::GpuMemcpyAsync(out->data(), + in.data(), + numel * sizeof(T), + phi::gpuMemcpyDeviceToDevice, + ctx.stream()); } else { - LaunchWithDispatchIndex(ctx, - simplifier.GetCount(), - simplifier.GetVecSize(), - simplifier.GetPermType(), - simplifier.GetDims(), - simplifier.GetPerm(), - in.data(), - out->data()); + if (numel < std::numeric_limits::max()) { + LaunchWithDispatchVecSize(ctx, + classifier.GetVecSize(), + classifier.GetPermType(), + classifier.GetSrcDims(), + classifier.GetPerm(), + in.data(), + out->data(), + static_cast(numel)); + } else { + int64_t cnt = static_cast(numel); + LaunchWithDispatchVecSize(ctx, + classifier.GetVecSize(), + classifier.GetPermType(), + classifier.GetSrcDims(), + classifier.GetPerm(), + in.data(), + out->data(), + static_cast(numel)); + } } } @@ -1196,7 +1190,7 @@ void TransposeGPUKernelDriver(const phi::GPUContext& ctx, if (!ret) { auto* tuner = phi::autotune::MakeTransposeTuner(TransCompute); - tuner->AddCallBack(SimplifyThenLaunch); + tuner->AddCallBack(PermuteAndTranspose); size_t key = phi::autotune::TransposeKey( phi::vectorize(in.dims()), diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index a533b17fc175d..45495505e6059 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -71,69 +71,72 @@ enum PermuteType { kCopy = 1, kTranspose = 2, kVecPermute = 3, - kNormalPermute = 4 + kGeneralPermute = 4 }; constexpr int kBlockRows = 16; constexpr int kTileSize = 32; -// To avoid bank conflict. -constexpr int kShareCol = kTileSize + 1; // Simplify the input dims and permute dims if possible. template -class DimsSimplifier { +class TranposeTypeClassifier { public: - explicit DimsSimplifier(const int sm_count, - const int rank, - const std::vector& perm, - const std::vector& dims, - const T* src, - T* dst) - : perm_(rank), dims_(rank) { + TranposeTypeClassifier(const int sm_count, + const size_t rank, + const int64_t numel, + const std::vector& perm, + const std::vector& dims, + const T* src, + T* dst) + : perm_(rank), src_dims(rank) { SimplifyPermAndDims(rank, dims, perm); - count_ = std::accumulate( - dims.begin(), dims.end(), size_t{1}, std::multiplies()); if (rank_ > 1) { vec_size_ = GetPermVecSize(sm_count, src, dst); - perm_.resize(rank_); - dims_.resize(rank_); + } + perm_.resize(rank_); + src_dims.resize(rank_); + dst_dims.resize(rank_); + + for (auto i = 0; i < rank_; ++i) { + dst_dims[i] = src_dims[perm_[i]]; } } - size_t GetCount() const { return count_; } + int GetRank() const { return rank_; } int GetVecSize() const { return vec_size_; } PermuteType GetPermType() const { return type_; } std::vector GetPerm() const { return perm_; } - std::vector GetDims() const { return dims_; } + std::vector GetSrcDims() const { return src_dims; } + std::vector GetDstDims() const { return dst_dims; } private: - size_t rank_{1}; - size_t count_{0}; + int rank_{1}; int vec_size_{1}; std::vector perm_; - std::vector dims_; + std::vector src_dims; + std::vector dst_dims; PermuteType type_{kCopy}; void SimplifyPermAndDims(const size_t rank, - const std::vector& in_dims, + const std::vector& in_dims, const std::vector& perm) { - size_t combined_dims[phi::DDim::kMaxRank]; + int64_t combined_dims[phi::DDim::kMaxRank]; int valid_map[phi::DDim::kMaxRank]; - // Merge consecutive dims to the fist one of this these dims, - // and leave the origin dim value to be 1. 
Example below : + // Merge consecutive dims to the fist one dim and + // leave original dim to be 1. Example below : // perm: [2, 3, 0, 1], origin_dims : [4, 8, 2, 5] // new_dims: [4, 8, 2, 5] -> [32, 1, 10, 1] - size_t start_perm_idx = 0; + int start_perm_idx = 0; while (start_perm_idx < rank) { - const size_t start_dim_idx = perm[start_perm_idx]; + const int start_dim_idx = perm[start_perm_idx]; combined_dims[start_dim_idx] = in_dims[start_dim_idx]; - size_t end_perm_idx = start_perm_idx + 1; + int end_perm_idx = start_perm_idx + 1; while (end_perm_idx < rank && perm[end_perm_idx] == perm[end_perm_idx - 1] + 1) { - const size_t end_dim_idx = perm[end_perm_idx]; + const int end_dim_idx = perm[end_perm_idx]; combined_dims[start_dim_idx] *= in_dims[end_dim_idx]; combined_dims[end_dim_idx] = 1; end_perm_idx += 1; @@ -145,22 +148,22 @@ class DimsSimplifier { // for example, if combined dims is [32, 1, 10, 1], // valid_map is [0, -1, 1, -1] and generate simplified // dims as [32, 10] - size_t valid_dim_idx = 0; + int valid_dim_idx = 0; bool sequential_flag = false; - for (size_t i = 0; i < rank; ++i) { + for (auto i = 0; i < rank; ++i) { const int src_dim = combined_dims[i]; if (src_dim == 1) { valid_map[i] = -1; } else { sequential_flag = true; valid_map[i] = valid_dim_idx; - dims_[valid_dim_idx] = src_dim; + src_dims[valid_dim_idx] = src_dim; valid_dim_idx += 1; } } if (valid_dim_idx == 0) { - dims_[0] = 1; + src_dims[0] = 1; perm_[0] = 0; return; } else if (valid_dim_idx == 1) { @@ -169,8 +172,8 @@ class DimsSimplifier { // Acquire simplified perm with help of combined dims // and original perm, finally simplified perm is [1, 0] - size_t perm_idx = 0; - for (size_t i = 0; i < rank; ++i) { + int perm_idx = 0; + for (auto i = 0; i < rank; ++i) { const int mapped = valid_map[perm[i]]; if (mapped >= 0) { perm_[perm_idx] = mapped; @@ -183,20 +186,17 @@ class DimsSimplifier { int GetPermVecSize(const int sm_count, const T* src, T* dst) { // For gerneal_permute kernel, there is good chance for // vectorized write. + type_ = PermuteType::kGeneralPermute; int vec_size = phi::GetVectorizedSize(dst); - type_ = PermuteType::kNormalPermute; // While the last dim is fixed, there is good chance for // both vectorized read and write. if (perm_[rank_ - 1] == rank_ - 1) { int tmp_size = std::min(vec_size, phi::GetVectorizedSize(src)); - tmp_size = GetDimVesSize(tmp_size, dims_[rank_ - 1]); + tmp_size = GetDimVesSize(tmp_size, src_dims[rank_ - 1]); if (tmp_size > 1) { type_ = kVecPermute; vec_size = tmp_size; - - // For stride calculation of src_data index. - dims_[rank_ - 1] /= vec_size; } } @@ -205,31 +205,11 @@ class DimsSimplifier { if ((rank_ == 2 && perm_[1] == 0 && perm_[0] == 1) || (rank_ == 3 && perm_[2] == 1 && perm_[1] == 2)) { type_ = PermuteType::kTranspose; - - // Compared with vectorized load or read, set config to let more - // sm work simultaneously affect more according to performance. - constexpr int threads = kTileSize * kTileSize; - int blocks = count_ / threads; - if (blocks < sm_count) { - vec_size = 1; - } else { - int tmp_vec = std::min(vec_size, phi::GetVectorizedSize(src)); - // With bytes limitation of shared_memory, the VecSize shall be - // restricted for the type whose byte-size is less than 8 (double). - int type_vec = - sizeof(T) > 8 ? 
1 : GetDimVesSize(tmp_vec, dims_[rank_ - 1]); - for (int i = type_vec; i > 0; i /= 2) { - if (blocks / i >= sm_count) { - break; - } - // When blocks is smaller than sm_count, a test shown that decrease - // vec_size to make blocks close to sm_count would gain performance. - vec_size = i; - } - } - - dims_[rank_ - 1] /= vec_size; - count_ /= vec_size; + int tmp_vec = std::min(vec_size, phi::GetVectorizedSize(src)); + // With bytes limitation of shared_memory, the VecSize shall be + // restricted for the type whose byte-size is less than 8 (double). + vec_size = + sizeof(T) > 8 ? 1 : GetDimVesSize(tmp_vec, src_dims[rank_ - 1]); } return vec_size; } diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h index ff97b2a1f48f4..d9f96ec2328f4 100644 --- a/paddle/phi/kernels/autotune/auto_tune_base.h +++ b/paddle/phi/kernels/autotune/auto_tune_base.h @@ -123,7 +123,7 @@ class AutoTuneBase { float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) { // Regard 1st run as warmup, judge the compare result by the time cost // of rest cycles. - constexpr int repeats = 3; + constexpr int repeats = 4; phi::GpuTimer timer; float time_cost = 0; const auto& stream = ctx.stream(); From 32efda3d27368e875802f63bb59a2f9214cf5c4e Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Tue, 1 Nov 2022 19:14:24 +0800 Subject: [PATCH 66/91] support no_sync attr for params in DataParallel (#47536) --- python/paddle/fluid/dygraph/parallel.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 51e0527e4fa99..b90c6bbb4ea50 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -706,7 +706,12 @@ def init_reducer(self): if param.trainable: layers_param.append((sublayer, param)) - trainable_parameters = [param for _, param in layers_param] + trainable_parameters = list( + filter( + lambda x: not getattr(x, "no_sync", False), + [param for _, param in layers_param], + ) + ) assert len(trainable_parameters) > 0, ( "This model does not have any parameters to train, and " From 2a932e55ceddbf1761c822a7792fcf5f1b5ff09e Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Tue, 1 Nov 2022 19:43:59 +0800 Subject: [PATCH 67/91] [geometric] Optimize graph sample speed (#47531) --- .../gpu/graph_sample_neighbors_kernel.cu | 75 +++++++++---------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu index 6632d3f8b2ec9..3ea1dbc8e19c2 100644 --- a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu @@ -58,7 +58,7 @@ struct MaxFunctor { } }; -template +template __global__ void SampleKernel(const uint64_t rand_seed, int k, const int64_t num_nodes, @@ -71,8 +71,7 @@ __global__ void SampleKernel(const uint64_t rand_seed, T* output_eids, int* output_ptr, bool return_eids) { - assert(blockDim.x == WARP_SIZE); - assert(blockDim.y == BLOCK_WARPS); + assert(blockDim.x == CTA_SIZE); int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y; const int64_t last_row = @@ -80,13 +79,13 @@ __global__ void SampleKernel(const uint64_t rand_seed, #ifdef PADDLE_WITH_HIP hiprandState rng; hiprand_init(rand_seed * gridDim.x + blockIdx.x, - threadIdx.y * WARP_SIZE + threadIdx.x, + threadIdx.y * 
CTA_SIZE + threadIdx.x, 0, &rng); #else - curandState rng; + curandStatePhilox4_32_10_t rng; curand_init(rand_seed * gridDim.x + blockIdx.x, - threadIdx.y * WARP_SIZE + threadIdx.x, + threadIdx.y * CTA_SIZE + threadIdx.x, 0, &rng); #endif @@ -94,7 +93,7 @@ __global__ void SampleKernel(const uint64_t rand_seed, while (out_row < last_row) { T node = nodes[out_row]; if (node > len_col_ptr - 1) { - out_row += BLOCK_WARPS; + out_row += BLOCK_CTAS; continue; } T in_row_start = col_ptr[node]; @@ -102,21 +101,21 @@ __global__ void SampleKernel(const uint64_t rand_seed, int out_row_start = output_ptr[out_row]; if (deg <= k) { - for (int idx = threadIdx.x; idx < deg; idx += WARP_SIZE) { + for (int idx = threadIdx.x; idx < deg; idx += CTA_SIZE) { output[out_row_start + idx] = row[in_row_start + idx]; if (return_eids) { output_eids[out_row_start + idx] = eids[in_row_start + idx]; } } } else { - for (int idx = threadIdx.x; idx < k; idx += WARP_SIZE) { + for (int idx = threadIdx.x; idx < k; idx += CTA_SIZE) { output[out_row_start + idx] = idx; } #ifdef PADDLE_WITH_CUDA - __syncwarp(); + __syncthreads(); #endif - for (int idx = k + threadIdx.x; idx < deg; idx += WARP_SIZE) { + for (int idx = k + threadIdx.x; idx < deg; idx += CTA_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); #else @@ -129,10 +128,10 @@ __global__ void SampleKernel(const uint64_t rand_seed, } } #ifdef PADDLE_WITH_CUDA - __syncwarp(); + __syncthreads(); #endif - for (int idx = threadIdx.x; idx < k; idx += WARP_SIZE) { + for (int idx = threadIdx.x; idx < k; idx += CTA_SIZE) { T perm_idx = output[out_row_start + idx] + in_row_start; output[out_row_start + idx] = row[perm_idx]; if (return_eids) { @@ -141,7 +140,7 @@ __global__ void SampleKernel(const uint64_t rand_seed, } } - out_row += BLOCK_WARPS; + out_row += BLOCK_CTAS; } } @@ -181,12 +180,12 @@ void SampleNeighbors(const Context& dev_ctx, thrust::exclusive_scan( output_count, output_count + bs, output_ptr.begin(), 0); - constexpr int WARP_SIZE = 32; - constexpr int BLOCK_WARPS = 128 / WARP_SIZE; - constexpr int TILE_SIZE = BLOCK_WARPS * 16; - const dim3 block(WARP_SIZE, BLOCK_WARPS); + constexpr int CTA_SIZE = 128; + constexpr int BLOCK_CTAS = 128 / CTA_SIZE; + constexpr int TILE_SIZE = BLOCK_CTAS; + const dim3 block(CTA_SIZE, BLOCK_CTAS); const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE); - SampleKernel + SampleKernel <<>>( 0, sample_size, @@ -202,7 +201,7 @@ void SampleNeighbors(const Context& dev_ctx, return_eids); } -template +template __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, int k, const int64_t num_rows, @@ -210,8 +209,7 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, const T* in_rows, T* src, const T* dst_count) { - assert(blockDim.x == WARP_SIZE); - assert(blockDim.y == BLOCK_WARPS); + assert(blockDim.x == CTA_SIZE); int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y; const int64_t last_row = @@ -221,7 +219,7 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, hiprand_init( rand_seed * gridDim.x + blockIdx.x, threadIdx.y + threadIdx.x, 0, &rng); #else - curandState rng; + curandStatePhilox4_32_10_t rng; curand_init( rand_seed * gridDim.x + blockIdx.x, threadIdx.y + threadIdx.x, 0, &rng); #endif @@ -229,7 +227,7 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, while (out_row < last_row) { const T row = in_rows[out_row]; if (row > len_col_ptr - 1) { - out_row += BLOCK_WARPS; + out_row += BLOCK_CTAS; continue; } const T in_row_start = dst_count[row]; @@ -241,7 +239,7 @@ 
__global__ void FisherYatesSampleKernel(const uint64_t rand_seed, } else { split = deg - k; } - for (int idx = split + threadIdx.x; idx <= deg - 1; idx += WARP_SIZE) { + for (int idx = split + threadIdx.x; idx <= deg - 1; idx += CTA_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); #else @@ -254,14 +252,14 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, src[in_row_start + idx]))); } #ifdef PADDLE_WITH_CUDA - __syncwarp(); + __syncthreads(); #endif } - out_row += BLOCK_WARPS; + out_row += BLOCK_CTAS; } } -template +template __global__ void GatherEdge(int k, int64_t num_rows, const T* in_rows, @@ -273,8 +271,7 @@ __global__ void GatherEdge(int k, int* output_ptr, T* perm_data, bool return_eids) { - assert(blockDim.x == WARP_SIZE); - assert(blockDim.y == BLOCK_WARPS); + assert(blockDim.x == CTA_SIZE); int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y; const int64_t last_row = @@ -287,7 +284,7 @@ __global__ void GatherEdge(int k, const T out_row_start = output_ptr[out_row]; if (deg <= k) { - for (int idx = threadIdx.x; idx < deg; idx += WARP_SIZE) { + for (int idx = threadIdx.x; idx < deg; idx += CTA_SIZE) { outputs[out_row_start + idx] = src[in_row_start + idx]; if (return_eids) { output_eids[out_row_start + idx] = eids[in_row_start + idx]; @@ -304,7 +301,7 @@ __global__ void GatherEdge(int k, end = deg; } - for (int idx = begin + threadIdx.x; idx < end; idx += WARP_SIZE) { + for (int idx = begin + threadIdx.x; idx < end; idx += CTA_SIZE) { outputs[out_row_start + idx - begin] = src[perm_data[in_row_start + idx]]; if (return_eids) { @@ -313,7 +310,7 @@ __global__ void GatherEdge(int k, } } } - out_row += BLOCK_WARPS; + out_row += BLOCK_CTAS; } } @@ -337,13 +334,13 @@ void FisherYatesSampleNeighbors(const Context& dev_ctx, thrust::exclusive_scan( output_count, output_count + bs, output_ptr.begin(), 0); - constexpr int WARP_SIZE = 32; - constexpr int BLOCK_WARPS = 128 / WARP_SIZE; - constexpr int TILE_SIZE = BLOCK_WARPS * 16; - const dim3 block(WARP_SIZE, BLOCK_WARPS); + constexpr int CTA_SIZE = 128; + constexpr int BLOCK_CTAS = 128 / CTA_SIZE; + constexpr int TILE_SIZE = BLOCK_CTAS; + const dim3 block(CTA_SIZE, BLOCK_CTAS); const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE); - FisherYatesSampleKernel + FisherYatesSampleKernel <<>>(0, sample_size, bs, @@ -352,7 +349,7 @@ void FisherYatesSampleNeighbors(const Context& dev_ctx, perm_data, col_ptr); - GatherEdge + GatherEdge <<>>( sample_size, bs, From a341bb8c989afb5cad3199be45ab89ae0152bbda Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Tue, 1 Nov 2022 19:53:18 +0800 Subject: [PATCH 68/91] clean mkldnn headerfile (#47507) --- paddle/fluid/operators/detection/prior_box_op.cc | 6 +----- paddle/fluid/operators/gaussian_random_op.cc | 3 --- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc index 16a4a35f6698d..28251c32ddee9 100644 --- a/paddle/fluid/operators/detection/prior_box_op.cc +++ b/paddle/fluid/operators/detection/prior_box_op.cc @@ -14,14 +14,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/detection/prior_box_op.h" #include +#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/infermeta/binary.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif -#include "paddle/fluid/framework/convert_utils.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index b298ce5635e85..ee095c598bc1b 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -18,9 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif #include "paddle/phi/infermeta/nullary.h" namespace paddle { From db3239273ccba4b974c2e83b28a3fd40c0fa99e6 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Tue, 1 Nov 2022 20:03:44 +0800 Subject: [PATCH 69/91] [Paddle Inference] add RegisterOutputHook interface (#47050) --- paddle/fluid/framework/naive_executor.cc | 13 ++--- paddle/fluid/framework/naive_executor.h | 19 ++++---- .../fluid/inference/api/analysis_predictor.cc | 47 +++++++++++++++---- .../fluid/inference/api/analysis_predictor.h | 12 +++++ .../api/analysis_predictor_tester.cc | 47 +++++++++++++++++++ paddle/fluid/inference/api/paddle_api.h | 11 +++++ .../inference/api/paddle_inference_api.h | 10 ++++ paddle/fluid/inference/api/paddle_tensor.h | 7 +++ paddle/fluid/pybind/inference_api.cc | 5 +- 9 files changed, 145 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index eb4ad8d0daf79..52ed842d74e02 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -65,6 +65,9 @@ void NaiveExecutor::Run() { #ifdef PADDLE_WITH_INFERENCE_NVTX platform::CudaNvtxRangePop(); #endif + if (hookfunc_) { + hookfunc_(op.get()); + } } #ifdef PADDLE_WITH_INFERENCE_NVTX platform::CudaNvtxRangePop(); @@ -142,14 +145,8 @@ phi::DenseTensor *NaiveExecutor::FindTensor(const std::string &name) { return tensor; } -void NaiveExecutor::CleanFeedFetchOps() { - std::vector> ops; - for (auto &op : ops_) { - if (op->Type() != "feed" && op->Type() != "fetch") { - ops.emplace_back(std::move(op)); - } - } - ops_.swap(ops); +void NaiveExecutor::RegisterOutputHook(const HookFunc &hookfunc) { + hookfunc_ = hookfunc; } NaiveExecutor::~NaiveExecutor() { diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index 8ca3f5997af46..882f50b451a29 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,10 +25,6 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" -namespace phi { -class DenseTensor; -} // namespace phi - namespace paddle { namespace framework { @@ -40,6 +37,8 @@ class Scope; class NaiveExecutor { public: + using HookFunc = std::function; + explicit NaiveExecutor(const platform::Place& place) : place_(place) {} ~NaiveExecutor(); @@ -66,13 +65,13 @@ class NaiveExecutor { // Get an tensor to operating directly, without the need for feed_ops. 
phi::DenseTensor* FindTensor(const std::string& name); - Scope* scope() { return scope_; } - - void CleanFeedFetchOps(); + Scope* GetScope() { return scope_; } void ResetTrtOps(int num); - protected: + void RegisterOutputHook(const HookFunc& hookfunc); + + private: void CreateOps(const ProgramDesc& desc, int block_id, bool with_feed_fetch_ops); @@ -81,7 +80,9 @@ class NaiveExecutor { const platform::Place place_; // Catch the required resource to avoid recreate. std::vector> ops_; - Scope* scope_; + Scope* scope_{nullptr}; + + HookFunc hookfunc_{nullptr}; }; } // namespace framework diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 9197efc2a5edb..280427cb4c8f3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -32,6 +32,7 @@ #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/var_type_traits.h" @@ -1557,10 +1558,10 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { - scope = executor_->scope(); + scope = executor_->GetScope(); } #else - scope = executor_->scope(); + scope = executor_->GetScope(); #endif PADDLE_ENFORCE_NOT_NULL( scope->FindVar(name), @@ -1612,10 +1613,10 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { - scope = executor_->scope(); + scope = executor_->GetScope(); } #else - scope = executor_->scope(); + scope = executor_->GetScope(); #endif PADDLE_ENFORCE_NOT_NULL( scope->FindVar(name), @@ -1997,7 +1998,7 @@ void AnalysisPredictor::ClearIntermediateTensor() { for (auto *var : global_block->AllVars()) { if (!IsPersistable(var)) { const std::string name = var->Name(); - auto *variable = executor_->scope()->FindVar(name); + auto *variable = executor_->GetScope()->FindVar(name); if (variable != nullptr && variable->IsType() && name != "feed" && name != "fetch") { VLOG(3) << "Clear Intermediate Tensor: " << name; @@ -2178,6 +2179,33 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) { exe.Run(save_program, scope(), 0, true, true); } +void AnalysisPredictor::RegisterOutputHook(const Exp_OutputHookFunc &hookfunc) { + if (config_.enable_memory_optim()) { + LOG(WARNING) << "If you want to run output hook function, you should " + "use config.EnableMemoryOptim(false) to turn off memory " + "reuse!"; + return; + } + static std::once_flag register_hook_flag; + std::call_once(register_hook_flag, [this] { + executor_->RegisterOutputHook([this](framework::OperatorBase *op) { + for (auto &output : op->Outputs()) { + for (auto &var_name : output.second) { + auto *var = this->sub_scope_->FindVar(var_name); + if (!var || !var->IsType()) continue; + auto dense_tensor = var->Get(); + if (!dense_tensor.initialized()) continue; + auto tensor = this->GetOutputTensor(var_name); + for (auto &hookfunc : this->hookfuncs_) { + hookfunc(op->Type(), var_name, *tensor); + } + } + } + }); + }); + hookfuncs_.push_back(hookfunc); +} + template <> std::unique_ptr CreatePaddlePredictor( const AnalysisConfig &config) { @@ -2371,6 +2399,10 @@ void Predictor::ClearIntermediateTensor() { uint64_t Predictor::TryShrinkMemory() { return 
predictor_->TryShrinkMemory(); }
 
+void Predictor::RegisterOutputHook(const Exp_OutputHookFunc &hookfunc) {
+  predictor_->RegisterOutputHook(hookfunc);
+}
+
 void *Predictor::GetExecStream() const { return predictor_->GetExecStream(); }
 
 int GetNumBytesOfDataType(DataType dtype) {
@@ -2452,10 +2484,9 @@ PredictorPool::PredictorPool(const Config &config, size_t size) {
   for (size_t i = 0; i < size - 1; i++) {
     if (config.tensorrt_engine_enabled()) {
       Config config_tmp(copy_config);
-      preds_.push_back(
-          std::move(std::unique_ptr(new Predictor(config_tmp))));
+      preds_.emplace_back(new Predictor(config_tmp));
     } else {
-      preds_.push_back(std::move(main_pred_->Clone()));
+      preds_.emplace_back(main_pred_->Clone());
     }
   }
 }
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index d1dd921db1484..37d1511fa272d 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -272,6 +272,16 @@ class AnalysisPredictor : public PaddlePredictor {
   ///
   std::string GetSerializedProgram() const override;
 
+  ///
+  /// \brief Register an output hook function to operate on the intermediate
+  /// tensors of op outputs. When using this function, memory reuse should be
+  /// turned off. The hook function signature is void(const std::string&,
+  /// const std::string&, const Tensor&). Here, the first parameter is the
+  /// op's type, the second is the output variable name of the op, and the
+  /// third is the output tensor with that variable name.
+  ///
+  void RegisterOutputHook(const Exp_OutputHookFunc &hookfunc) override;
+
   ///
   /// \brief Initialize mkldnn quantizer and execute mkldnn quantization pass
   ///
@@ -510,6 +520,8 @@ class AnalysisPredictor : public PaddlePredictor {
   int predictor_id_;
 
  private:
+  std::vector hookfuncs_;
+
   // Some status here that help to determine the status inside the predictor.
bool status_is_cloned_{false};
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 8856ceb61a76f..5cba8f06ab977 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -611,4 +611,51 @@ TEST(Predictor, Streams) {
 }
 #endif
 
+TEST(AnalysisPredictor, OutputHookFunc) {
+  auto hookfunc = [](const std::string& type,
+                     const std::string& var_name,
+                     const Tensor& tensor) { LOG(INFO) << "in hook function"; };
+
+  {
+    Config config;
+    config.SetModel(FLAGS_dirname);
+    config.EnableUseGpu(100, 0);
+
+    auto predictor = CreatePredictor(config);
+
+    predictor->RegisterOutputHook(hookfunc);
+    auto w0 = predictor->GetInputHandle("firstw");
+    auto w1 = predictor->GetInputHandle("secondw");
+    auto w2 = predictor->GetInputHandle("thirdw");
+    auto w3 = predictor->GetInputHandle("forthw");
+    w0->Reshape({4, 1});
+    w1->Reshape({4, 1});
+    w2->Reshape({4, 1});
+    w3->Reshape({4, 1});
+    auto* w0_data = w0->mutable_data(PlaceType::kCPU);
+    auto* w1_data = w1->mutable_data(PlaceType::kCPU);
+    auto* w2_data = w2->mutable_data(PlaceType::kCPU);
+    auto* w3_data = w3->mutable_data(PlaceType::kCPU);
+    for (int i = 0; i < 4; i++) {
+      w0_data[i] = i;
+      w1_data[i] = i;
+      w2_data[i] = i;
+      w3_data[i] = i;
+    }
+    predictor->Run();
+    predictor->TryShrinkMemory();
+  }
+
+  {
+    Config config;
+    config.SetModel(FLAGS_dirname);
+    config.EnableMemoryOptim();
+    config.EnableUseGpu(100, 0);
+
+    auto predictor = CreatePredictor(config);
+
+    predictor->RegisterOutputHook(hookfunc);
+  }
+}
+
 }  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index ffb634ce82968..ff1ec1eba3025 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -38,6 +38,7 @@ namespace paddle {
 using PaddleDType = paddle_infer::DataType;
 using PaddlePlace = paddle_infer::PlaceType;
 using PaddleDataLayout = paddle_infer::DataLayout;
+using paddle_infer::Exp_OutputHookFunc;
 
 /// \brief Memory manager for PaddleTensor.
 ///
@@ -289,6 +290,16 @@ class PD_INFER_DECL PaddlePredictor {
   ///
   virtual uint64_t TryShrinkMemory() { return 0; }
 
+  ///
+  /// \brief Register an output hook function to operate on the intermediate
+  /// tensors of op outputs. When using this function, memory reuse should be
+  /// turned off. The hook function signature is void(const std::string&,
+  /// const std::string&, const Tensor&). Here, the first parameter is the
+  /// op's type, the second is the output variable name of the op, and the
+  /// third is the output tensor with that variable name.
+  ///
+  virtual void RegisterOutputHook(const Exp_OutputHookFunc& hookfunc) {}
+
   /// \brief Clone an existing predictor
   /// When using clone, the same network will be created,
   /// and the parameters between them are shared.
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 055cf3a13fbaf..1a52c011b2a80 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -157,6 +157,16 @@ class PD_INFER_DECL Predictor {
   ///
   uint64_t TryShrinkMemory();
 
+  ///
+  /// \brief Register an output hook function to operate on the intermediate
+  /// tensors of op outputs. When using this function, memory reuse should be
+  /// turned off. The hook function signature is void(const std::string&,
+  /// const std::string&, const Tensor&). Here, the first parameter is the
+  /// op's type, the second is the output variable name of the op, and the
+  /// third is the output tensor with that variable name.
+  ///
+  void RegisterOutputHook(const Exp_OutputHookFunc& hookfunc);
+
   ///
   /// \brief Get the execution stream on devices with a concept of stream,
   /// otherwise returns nullptr.
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h
index b10f051d6e44e..9bc95f251eb60 100644
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -14,7 +14,10 @@
 
 #pragma once
 
+#include
+#include
 #include
+#include
 
 #include "paddle_infer_declare.h"  // NOLINT
 
@@ -29,6 +32,10 @@ namespace paddle_infer {
 /// Strings for text data.
 using Strings = std::vector;
 
+class Tensor;
+using Exp_OutputHookFunc =
+    std::function;
+
 typedef void (*CallbackFunc)(void*);
 
 #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 5d2a579907883..9b99cad869315 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/pybind/inference_api.h"
 
+#include
 #include
 #include
 
@@ -946,7 +947,9 @@ void BindPaddleInferPredictor(py::module *m) {
 #endif
       .def("try_shrink_memory", &paddle_infer::Predictor::TryShrinkMemory)
       .def("clear_intermediate_tensor",
-           &paddle_infer::Predictor::ClearIntermediateTensor);
+           &paddle_infer::Predictor::ClearIntermediateTensor)
+      .def("register_output_hook",
+           &paddle_infer::Predictor::RegisterOutputHook);
 }
 
 void BindZeroCopyTensor(py::module *m) {
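A minimal end-to-end use of the interface added in this patch looks like the
sketch below. It only assumes the API introduced above (Config,
CreatePredictor, RegisterOutputHook, and the Exp_OutputHookFunc signature);
the model directory is a placeholder, and it mirrors the OutputHookFunc test
above. Memory optimization is left off so the hook actually fires.

// Hedged usage sketch for the new RegisterOutputHook interface.
#include <iostream>
#include <string>
#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./model_dir");  // placeholder model directory
  // Do not call config.EnableMemoryOptim(): with memory reuse on, the
  // predictor warns and skips hook registration.

  auto predictor = paddle_infer::CreatePredictor(config);
  predictor->RegisterOutputHook(
      [](const std::string& op_type, const std::string& var_name,
         const paddle_infer::Tensor& tensor) {
        // Fired once per op output after each op runs.
        std::cout << op_type << " -> " << var_name << std::endl;
      });

  // ... feed inputs via GetInputHandle() as usual ...
  predictor->Run();
  return 0;
}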
From f913404589569d5076120fe2bf380446cccdc7e9 Mon Sep 17 00:00:00 2001
From: HongyuJia
Date: Tue, 1 Nov 2022 20:51:45 +0800
Subject: [PATCH 70/91] [Kernel Selection] Remove hard code of PADDLE_WITH_CUDA (#47325)

* move cudnn hardcode outside GetExpectedKernelType
* add header file
* debug
* update interpreter_util with hardcode
* update interpreter_util headerfile
* solve activation hardcode
* debug with CI
* add mkldnn_op_list header file
* temporarily uncomment mkldnn
* temporarily uncomment mkldnn
* delete sequence_softmax cudnn hardcode
* add hardcode to data_transfer.cc
* update data_transfer headerfile
* try fix segment fault
* update cudnn&miopen_helper
* reset HasAttr of DygraphExctnCtx
* debug, this commit should pass all CI
* debug should pass CI, temporarily disable activation
* debug should pass CI
* fix default_attr=nullptr bug
* clean debug code
---
 .../new_executor/interpreter/data_transfer.cc |  9 +++++++
 .../interpreter/interpreter_util.cc           |  9 +++++++
 paddle/fluid/framework/operator.cc            | 24 +++++++++++++++++
 paddle/fluid/imperative/execution_context.h   |  3 ++-
 paddle/fluid/imperative/prepared_operator.cc  |  9 +++++++
 paddle/fluid/operators/activation_op.cc       |  8 ++++++
 paddle/fluid/operators/affine_grid_op.cc      | 23 +++------------
 paddle/fluid/operators/conv_transpose_op.cc   | 27 -------------------
 paddle/fluid/operators/grid_sampler_op.cc     | 26 +++---------------
 paddle/fluid/operators/pool_op.cc             | 21 ++-------------
 .../sequence_ops/sequence_softmax_op.cc       | 16 -----------
 paddle/fluid/operators/softmax_op.cc          | 16 -----------
 .../platform/device/gpu/cuda/cudnn_helper.h   |  4 +--
 .../platform/device/gpu/rocm/miopen_helper.h  |  4 +--
 14 files changed, 75 insertions(+), 124 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc
index bf51ebd1d48d7..20ccdece426c5 100644
--- a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc
@@ -22,6 +22,9 @@
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/phi/backends/onednn/onednn_context.h"
 #endif
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -133,6 +136,12 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
   auto* dev_ctx = pool.Get(place_);
   auto exec_ctx = ExecutionContext(*op, Scope(), *dev_ctx, runtime_context);
   auto expected_kernel_key = op_with_kernel->GetExpectedKernelType(exec_ctx);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (!op_with_kernel->DnnFallback() &&
+      paddle::platform::CanCUDNNBeUsed(exec_ctx)) {
+    expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN;
+  }
+#endif
   VLOG(6) << "expected_kernel_key " << expected_kernel_key << "\n";
   VLOG(6) << "op_with_kernel Type() " << op_with_kernel->Type() << "\n";
 
diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
index 104217fa80f22..816331e3fa549 100644
--- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
@@ -32,6 +32,9 @@
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#endif
 
 PADDLE_DEFINE_EXPORTED_bool(
     new_executor_serial_run,
@@ -615,6 +618,12 @@ void BuildOpFuncList(const platform::Place& place,
               *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context);
           auto expected_kernel_key = op_with_kernel->GetExpectedKernelType(exec_ctx);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+          if (!op_with_kernel->DnnFallback() &&
+              paddle::platform::CanCUDNNBeUsed(exec_ctx)) {
+            expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN;
+          }
+#endif
           VLOG(4) << "expected_kernel_key : " << expected_kernel_key;
           // change device by the device_guard()
           ApplyDeviceGuard(op, place, &expected_kernel_key);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b4714407686d8..5d24758de0b93 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -58,6 +58,10 @@ class DenseTensor;
 #include "paddle/fluid/platform/device/mlu/mlu_info.h"
 #endif
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#endif
+
 DECLARE_bool(benchmark);
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(enable_unused_var_check);
@@ -1409,6 +1413,14 @@ bool OperatorWithKernel::SupportsKernelType(
   }
 #endif
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (!this->DnnFallback() && paddle::platform::CanCUDNNBeUsed(exe_ctx)) {
+    auto tmp_kernel_type = kernel_type;
+    tmp_kernel_type.library_type_ = framework::LibraryType::kCUDNN;
+    return kernels.find(tmp_kernel_type) != kernels.end();
+  }
+#endif
+
   return kernel_iter != kernels.end();
 }
 
@@ -1589,6 +1601,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     }
 #endif
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    if (!this->DnnFallback() && paddle::platform::CanCUDNNBeUsed(exe_ctx)) {
+      kernel_type_->library_type_ = framework::LibraryType::kCUDNN;
+    }
+#endif
+
     // NOTE(Liu-xiandong):In
my ctest, this branch do not be executed, // I can't understand it, it's really confusing. // But we still need to keep this to avoid errors. @@ -1832,6 +1850,12 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( } #endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (!this->DnnFallback() && paddle::platform::CanCUDNNBeUsed(ctx)) { + expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; + } +#endif + if (HasAttr("op_device")) { if (Attr("op_device") == "cpu") { expected_kernel_key.place_ = platform::CPUPlace(); diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index 6d4f7c347b097..4ac885dbe3f97 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -103,7 +103,8 @@ class DygraphExecutionContext : public framework::ExecutionContext { bool HasAttr(const std::string& name) const override { if (attrs_.find(name) == attrs_.end()) { - return default_attrs_.find(name) != default_attrs_.end(); + return &default_attrs_ != nullptr && + default_attrs_.find(name) != default_attrs_.end(); } return true; } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index d76e06bd4143e..2a35474285113 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -28,6 +28,9 @@ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_op_list.h" #endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#endif #include "paddle/fluid/framework/library_type.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" @@ -246,6 +249,12 @@ PreparedOp PrepareImpl( } #endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (!op.DnnFallback() && paddle::platform::CanCUDNNBeUsed(dygraph_exe_ctx)) { + expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; + } +#endif + #if defined(PADDLE_WITH_XPU) bool is_xpu_unsupport = paddle::platform::is_xpu_place(expected_kernel_key.place_) && diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 6a239da553a58..b4cf9e9e009de 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -93,6 +93,14 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, // library = framework::LibraryType::kCUDNN; // } // #endif + + // NOTE(jiahongyu): Activation ops have attribute use_cudnn, but cudnn kernels + // are temporarily disabled. Therefore, cudnn kernel also needs to fallback to + // plain GPU kernel temporarily. When above codes are uncommented, below + // fallback codes can be deleted safely. 
+ if (paddle::platform::is_gpu_place(ctx.GetPlace())) { + oper.SetDnnFallback(true); + } return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 8d123710e750e..2d7eb04f1dba0 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -134,15 +134,8 @@ class AffineGridOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - framework::LibraryType library{framework::LibraryType::kPlain}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - library = framework::LibraryType::kCUDNN; - } -#endif auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Theta"); - return framework::OpKernelType( - data_type, ctx.GetPlace(), phi::DataLayout::kAnyLayout, library); + return framework::OpKernelType(data_type, ctx.GetPlace()); } }; @@ -252,17 +245,9 @@ class AffineGridOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - framework::LibraryType library_{framework::LibraryType::kPlain}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kCUDNN; - } -#endif - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Output")), - ctx.GetPlace(), - phi::DataLayout::kAnyLayout, - library_); + auto data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Output")); + return framework::OpKernelType(data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index f5702f2179431..e9c4245bc4731 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -28,9 +28,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#endif namespace paddle { namespace operators { @@ -40,14 +37,6 @@ using DataLayout = phi::DataLayout; framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - phi::DataLayout::kAnyLayout, - framework::LibraryType::kCUDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } @@ -268,14 +257,6 @@ The input(X) size and output(Out) size may be different. 
framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - phi::DataLayout::kAnyLayout, - framework::LibraryType::kCUDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } @@ -343,14 +324,6 @@ class ConvTransposeDoubleGradMaker : public framework::SingleGradOpMaker { framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - phi::DataLayout::kAnyLayout, - framework::LibraryType::kCUDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 77865647c4c5b..7f57d6e288f87 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -35,17 +35,8 @@ class GridSampleOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - framework::LibraryType library_{framework::LibraryType::kPlain}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kCUDNN; - } -#endif - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace(), - phi::DataLayout::kAnyLayout, - library_); + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(data_type, ctx.GetPlace()); } }; @@ -146,17 +137,8 @@ class GridSampleOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - framework::LibraryType library_{framework::LibraryType::kPlain}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kCUDNN; - } -#endif - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace(), - phi::DataLayout::kAnyLayout, - library_); + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 7842de9b17a3b..48bfa3576ab6c 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -44,21 +44,13 @@ bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { framework::OpKernelType PoolOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - framework::LibraryType library_{framework::LibraryType::kPlain}; - phi::DataLayout layout_ = phi::DataLayout::kAnyLayout; auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kCUDNN; - } -#endif - // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_MKLDNN 
this->SetDnnFallback(!CanMKLDNNSupportPool(ctx)); // NOTE(jiahongyu) END: Above codes originally enclosed by PADDLE_WITH_MKLDNN - return framework::OpKernelType(data_type, ctx.GetPlace(), layout_, library_); + return framework::OpKernelType(data_type, ctx.GetPlace()); } framework::OpKernelType PoolOp::GetKernelTypeForVar( @@ -86,22 +78,13 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar( framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - framework::LibraryType library_{framework::LibraryType::kPlain}; - phi::DataLayout layout_ = phi::DataLayout::kAnyLayout; auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kCUDNN; - } -#endif - // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_MKLDNN this->SetDnnFallback(!CanMKLDNNSupportPool(ctx)); // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_MKLDNN - return framework::OpKernelType( - input_data_type, ctx.GetPlace(), layout_, library_); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType PoolOpGrad::GetKernelTypeForVar( diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 5b4b9aef88637..80f13a51ab0b1 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -43,14 +43,6 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel { if (ctx.HasAttr("data_format")) { layout_ = phi::StringToDataLayout(ctx.Attr("data_format")); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - layout_, - framework::LibraryType::kCUDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_); } }; @@ -135,14 +127,6 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { if (ctx.HasAttr("data_format")) { layout_ = phi::StringToDataLayout(ctx.Attr("data_format")); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - layout_, - framework::LibraryType::kCUDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_); } }; diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 42e0e5250e084..bc11f53e00935 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -48,14 +48,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "float16 can only be used on GPU/NPU/XPU/MLU and custom place")); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - layout_, - framework::LibraryType::kCUDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_); } }; @@ -140,14 +132,6 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { PADDLE_THROW(platform::errors::InvalidArgument( "float16 can only be used on GPU/NPU/XPU/MLU and custom place")); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::CanCUDNNBeUsed(ctx)) { - return 
framework::OpKernelType(input_data_type,
-                                   ctx.GetPlace(),
-                                   layout_,
-                                   framework::LibraryType::kCUDNN);
-  }
-#endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_);
   }
 };
diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h
index 4fa25476336f6..595f47d98e56b 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h
@@ -617,8 +617,8 @@ class ScopedActivationDescriptor {
 };
 
 inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
-  bool use_cudnn = ctx.HasAttr("use_cudnn") && ctx.Attr("use_cudnn");
-  use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
+  bool use_cudnn = paddle::platform::is_gpu_place(ctx.GetPlace()) &&
+                   ctx.HasAttr("use_cudnn") && ctx.Attr("use_cudnn");
 #ifdef PADDLE_WITH_CUDA
   if (use_cudnn) {
     auto& dev_ctx = ctx.device_context();
diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h
index 0c9d6d24cd1bf..019fdce9e044b 100644
--- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h
+++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h
@@ -554,8 +554,8 @@ class ScopedActivationDescriptor {
 };
 
 inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
-  bool use_cudnn = ctx.HasAttr("use_cudnn") && ctx.Attr("use_cudnn");
-  use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
+  bool use_cudnn = paddle::platform::is_gpu_place(ctx.GetPlace()) &&
+                   ctx.HasAttr("use_cudnn") && ctx.Attr("use_cudnn");
 #ifdef PADDLE_WITH_HIP
   if (use_cudnn) {
     auto& dev_ctx = ctx.device_context();
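In short, this patch replaces per-op cuDNN branches with one framework-level
decision. The standalone sketch below models that selection logic in
isolation; every name in it (Library, ToyContext, SelectLibrary) is an
illustrative stand-in rather than a Paddle type, and the authoritative logic
is the DnnFallback/CanCUDNNBeUsed branch added to operator.cc and the
interpreter above.

// Illustrative model of the centralized cuDNN kernel selection.
#include <iostream>

enum class Library { kPlain, kCUDNN };

struct ToyContext {
  bool is_gpu_place;   // models platform::is_gpu_place(ctx.GetPlace())
  bool has_use_cudnn;  // models ctx.HasAttr("use_cudnn")
  bool use_cudnn;      // models the "use_cudnn" attribute value
  int cudnn_version;   // 0 when cuDNN is unavailable
};

// Mirrors the reordered CanCUDNNBeUsed above: the place check now runs
// first, so CPU ops never touch the attribute map.
bool CanCUDNNBeUsed(const ToyContext& ctx) {
  return ctx.is_gpu_place && ctx.has_use_cudnn && ctx.use_cudnn &&
         ctx.cudnn_version > 0;
}

// With this patch, individual ops return a plain kernel key and the
// framework upgrades it once, unless the op opted out via SetDnnFallback.
Library SelectLibrary(const ToyContext& ctx, bool dnn_fallback) {
  return (!dnn_fallback && CanCUDNNBeUsed(ctx)) ? Library::kCUDNN
                                                : Library::kPlain;
}

int main() {
  ToyContext gpu{true, true, true, 8200};
  ToyContext cpu{false, true, true, 8200};
  std::cout << (SelectLibrary(gpu, false) == Library::kCUDNN) << "\n";  // 1
  std::cout << (SelectLibrary(cpu, false) == Library::kCUDNN) << "\n";  // 0
}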
From 9d801855afccf4a71fca42808eb7b21ade6c9c09 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Tue, 1 Nov 2022 21:48:04 +0800
Subject: [PATCH 71/91] fix dynamic link of xpu library (#47434)

* refine comments,test=kunlun
* link xpu lib, test=kunlun
* add sleep for test, test=kunlun
* merge develop, fix compile, test=kunlun
* remove debug code, test=kunlun
* add dependency to avoid potential concurrency error, test=kunlun
---
 paddle/CMakeLists.txt                               | 10 +++++-----
 paddle/fluid/operators/collective/c_comm_init_op.cc |  2 --
 python/CMakeLists.txt                               | 10 ++++++++++
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 895bd9db9cf64..80f4c6bb542d7 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -49,18 +49,15 @@ endif()
 
 list(LENGTH test_names len)
 if(${len} GREATER_EQUAL 1)
-  message("Total tests: ${len}")
+  message("Total cpp tests using dynamic link: ${len}")
   math(EXPR stop "${len} - 1")
   foreach(idx RANGE ${stop})
     if(WITH_TESTING)
      list(GET test_srcs ${idx} test_src)
      list(GET test_names ${idx} test_name)
      get_property(test_arg GLOBAL PROPERTY "${test_name}_ARGS")
-      message("add test ${test_name}")
+      # message("add test ${test_name}")
      add_executable(${test_name} ${test_src})
-      # target_link_libraries(
-      #   ${test_name}
-      #   ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/libpaddle${POSTFIX})
      target_link_libraries(${test_name} $)
      target_link_libraries(${test_name} paddle_gtest_main_new)
      add_dependencies(${test_name} ${paddle_lib} paddle_gtest_main_new)
@@ -75,6 +72,9 @@ if(${len} GREATER_EQUAL 1)
        target_link_libraries(${test_name} "-Wl,-rpath,$")
      endif()
+      if(WITH_XPU)
+        target_link_libraries(${test_name} xpulib)
+      endif()
      if(NOT ("${test_name}" STREQUAL "c_broadcast_op_npu_test"
         OR "${test_name}" STREQUAL "c_allreduce_sum_op_npu_test"
diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc
index 59fbd02b5c086..26e700262f8dd 100644
--- a/paddle/fluid/operators/collective/c_comm_init_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_op.cc
@@ -57,11 +57,9 @@ class CCommInitOp : public framework::OperatorBase {
   using CommContext = platform::NCCLCommContext;
 #elif defined(PADDLE_WITH_XPU_BKCL)
   using UniqueId = BKCLUniqueId;
-  using Place = platform::XPUPlace;
   using CommContext = platform::BKCLCommContext;
 #elif defined(PADDLE_WITH_CNCL)
   using UniqueId = cnclCliqueId;
-  using Place = platform::MLUPlace;
   using CommContext = platform::CNCLCommContext;
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet(
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index b7d8eb1dcbc59..3a3c98a9e9956 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -48,6 +48,16 @@ set(FLUID_CORE_DEPS ${FLUID_CORE})
 
 add_custom_target(copy_libpaddle ALL DEPENDS ${FLUID_CORE_DEPS})
 
+# NOTE(zhiqiu): WHY?
+# In `setup.py.in`, some dynamic libraries (eg, libxpuapi.so) are modified
+# using patchelf. In rare cases, if a linker is linking that dynamic library
+# for some executables at the same time, a `file not recognized, file
+# truncated` error may occur, resulting in a compilation error.
+# So, add a dependency to force the cpp tests to be built before running
+# `setup.py.in`.
+if(WITH_TESTING)
+  add_dependencies(copy_libpaddle build_tests)
+endif()
+
 if(WIN32)
   add_custom_command(
     OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
From a35a4a53221e190c4ba566245eb72900c75b7165 Mon Sep 17 00:00:00 2001
From: Nyakku Shigure
Date: Tue, 1 Nov 2022 22:14:52 +0800
Subject: [PATCH 72/91] [CodeStyle][E711] use `is`/`is not` for comparison with `None` (#47452)

* [CodeStyle][E711] use `is`/`is not` for comparison with `None`
* `self.assertTrue($A is None)` -> `self.assertIsNone($A)`
* `self.assertTrue($A is not None)` -> `self.assertIsNotNone($A)`
* `self.assertFalse($A is None)` -> `self.assertIsNotNone($A)`
* `self.assertEqual($A, None)` -> `self.assertIsNone($A)`
* `self.assertNotEqual($A, None)` -> `self.assertIsNotNone($A)`
---
 paddle/scripts/conda_build.py                 |  8 ++---
 python/paddle/cost_model/cost_model.py        |  2 +-
 python/paddle/dataset/imdb.py                 |  2 +-
 python/paddle/dataset/tests/imdb_test.py      |  4 +--
 .../auto_parallel/cost/base_cost.py           |  2 +-
 .../distributed/auto_parallel/partitioner.py  |  2 +-
 .../auto_parallel/process_group.py            |  2 +-
 .../auto_parallel/tuner/optimization_tuner.py |  2 +-
 python/paddle/distributed/elastic.py          |  2 +-
 .../distributed/fleet/base/role_maker.py      | 36 +++++++++----------
 .../fleet/base/strategy_compiler.py           |  6 ++--
 .../distributed/fleet/base/util_factory.py    |  4 ++-
 .../fleet/data_generator/data_generator.py    |  4 +--
 python/paddle/distributed/fleet/fleet.py      |  2 +-
 python/paddle/distributed/fleet/launch.py     |  2 +-
 .../paddle/distributed/fleet/launch_utils.py  |  6 ++--
 .../graph_execution_optimizer.py              |  2 +-
 .../fleet/meta_optimizers/ps_optimizer.py     |  2 +-
 .../fleet/runtime/parameter_server_runtime.py |  2 +-
 .../distributed/fleet/runtime/the_one_ps.py   |  6 ++--
 python/paddle/distributed/fleet/utils/fs.py   |  2 +-
 .../fleet/utils/hybrid_parallel_inference.py  |  2 +-
 python/paddle/distributed/ps/coordinator.py   |  2 +-
 python/paddle/distributed/ps/the_one_ps.py    |  6 ++--
 python/paddle/distributed/ps/utils/public.py  |  2 +-
 python/paddle/distributed/rpc/rpc.py          |  2 +-
 .../paddle/distributed/utils/launch_utils.py  |  2 +-
 python/paddle/fluid/backward.py               |  2 +-
python/paddle/fluid/communicator.py | 20 +++++------ python/paddle/fluid/contrib/layers/nn.py | 2 +- .../paddle/fluid/contrib/layers/rnn_impl.py | 4 +-- .../post_training_quantization.py | 2 +- .../slim/quantization/quantization_pass.py | 16 ++++----- .../quantization/quantize_transpiler_v2.py | 2 +- .../fluid/contrib/slim/quantization/utils.py | 2 +- .../tests/test_user_defined_quantization.py | 2 +- .../test_weight_quantization_mobilenetv1.py | 2 +- python/paddle/fluid/device_worker.py | 8 ++--- python/paddle/fluid/distributed/helper.py | 6 ++-- .../dygraph_to_static/return_transformer.py | 2 +- python/paddle/fluid/dygraph/layers.py | 2 +- python/paddle/fluid/executor.py | 4 +-- python/paddle/fluid/framework.py | 3 +- .../incubate/checkpoint/auto_checkpoint.py | 6 ++-- .../fluid/incubate/data_generator/__init__.py | 4 +-- .../fluid/incubate/fleet/base/role_maker.py | 2 +- .../incubate/fleet/collective/__init__.py | 4 +-- .../fleet/parameter_server/ir/trainer_pass.py | 2 +- .../pslib/optimizer_factory.py | 2 +- .../paddle/fluid/incubate/fleet/utils/hdfs.py | 2 +- .../fluid/incubate/fleet/utils/utils.py | 2 +- python/paddle/fluid/io.py | 6 ++-- python/paddle/fluid/layers/control_flow.py | 2 +- python/paddle/fluid/layers/nn.py | 34 +++++++++--------- python/paddle/fluid/metrics.py | 2 +- python/paddle/fluid/net_drawer.py | 4 +-- python/paddle/fluid/optimizer.py | 20 +++++------ .../custom_op/test_custom_raw_op_kernel_op.py | 2 +- .../asp/test_asp_optimize_dynamic.py | 4 +-- .../unittests/auto_parallel/test_strategy.py | 6 ++-- .../auto_parallel/test_while_op_partition.py | 2 +- .../collective/fleet/test_auto_checkpoint.py | 16 ++++----- .../fleet/test_auto_checkpoint_dist_basic.py | 2 +- .../fleet/test_auto_checkpoint_multiple.py | 2 +- .../fleet/test_fleet_rolemaker_new.py | 4 +-- .../unittests/collective/fleet/test_hdfs1.py | 2 +- .../collective/init_process_group.py | 4 +-- .../unittests/dist_text_classification.py | 2 +- .../distributed_passes/dist_pass_test_base.py | 2 +- .../seq2seq_dygraph_model.py | 12 +++---- .../test_decorator_transform.py | 2 +- .../dygraph_to_static/test_function_spec.py | 2 +- .../unittests/dygraph_to_static/test_place.py | 2 +- .../unittests/ir_memory_optimize_net_base.py | 2 +- .../unittests/test_adaptive_avg_pool2d.py | 6 ++-- .../unittests/test_adaptive_avg_pool3d.py | 8 ++--- .../unittests/test_adaptive_max_pool2d.py | 6 ++-- .../unittests/test_adaptive_max_pool3d.py | 8 ++--- .../unittests/test_auto_parallel_mapper.py | 6 ++-- .../test_auto_parallel_partitioner_gpt.py | 14 ++++---- .../fluid/tests/unittests/test_base_layer.py | 2 +- .../test_decoupled_py_reader_data_check.py | 4 +-- .../unittests/test_dygraph_spectral_norm.py | 2 +- .../unittests/test_dygraph_weight_norm.py | 2 +- .../test_eager_deletion_delete_vars.py | 4 +-- .../test_eager_deletion_padding_rnn.py | 6 ++-- .../fluid/tests/unittests/test_fleet_base.py | 2 +- .../fluid/tests/unittests/test_fleet_util.py | 4 +-- .../tests/unittests/test_fused_matmul_bias.py | 2 +- .../test_global_var_getter_setter.py | 4 +-- .../unittests/test_imperative_auto_prune.py | 34 +++++++++--------- .../tests/unittests/test_imperative_basic.py | 20 +++++------ .../unittests/test_imperative_double_grad.py | 4 +-- .../unittests/test_imperative_optimizer.py | 6 ++-- .../unittests/test_imperative_optimizer_v2.py | 6 ++-- .../unittests/test_imperative_save_load.py | 6 ++-- .../test_imperative_selected_rows.py | 20 +++++------ .../unittests/test_inference_model_io.py | 2 +- .../fluid/tests/unittests/test_input_spec.py | 
4 +-- .../fluid/tests/unittests/test_lambv2_op.py | 4 +-- .../fluid/tests/unittests/test_layers.py | 12 +++---- .../test_paddle_imperative_double_grad.py | 2 +- .../fluid/tests/unittests/test_profiler.py | 4 +-- .../fluid/tests/unittests/test_pylayer_op.py | 14 ++++---- .../fluid/tests/unittests/test_regularizer.py | 4 +-- .../unittests/test_tensor_register_hook.py | 2 +- .../fluid/tests/unittests/test_var_base.py | 8 ++--- .../fluid/tests/unittests/test_var_info.py | 2 +- .../paddle/fluid/tests/unittests/testsuite.py | 2 +- .../tests/unittests/xpu/test_matmul_op_xpu.py | 2 +- python/paddle/fluid/trainer_desc.py | 10 +++--- python/paddle/fluid/trainer_factory.py | 4 +-- .../fluid/transpiler/distribute_transpiler.py | 2 +- .../fluid/transpiler/geo_sgd_transpiler.py | 2 +- python/paddle/nn/functional/common.py | 2 +- python/paddle/nn/functional/norm.py | 2 +- python/paddle/nn/functional/pooling.py | 10 +++--- python/paddle/nn/layer/distance.py | 2 +- python/paddle/nn/layer/norm.py | 18 +++++----- python/paddle/optimizer/optimizer.py | 2 +- python/paddle/profiler/profiler.py | 2 +- python/paddle/profiler/utils.py | 2 +- python/paddle/sparse/nn/layer/norm.py | 10 +++--- python/paddle/static/io.py | 2 +- python/paddle/tensor/einsum.py | 2 +- python/paddle/tensor/linalg.py | 12 +++---- python/paddle/tensor/manipulation.py | 4 +-- python/paddle/tensor/math.py | 10 +++--- python/paddle/tensor/search.py | 2 +- python/paddle/tests/test_utils_lazyimport.py | 2 +- python/paddle/text/datasets/imdb.py | 2 +- python/paddle/vision/ops.py | 2 +- tools/analysisPyXml.py | 2 +- tools/check_op_desc.py | 2 +- tools/get_single_test_cov.py | 2 +- ...rate_pd_op_dialect_from_paddle_op_maker.py | 2 +- tools/test_runner.py | 2 +- 137 files changed, 357 insertions(+), 352 deletions(-) diff --git a/paddle/scripts/conda_build.py b/paddle/scripts/conda_build.py index 98692ff3df39c..ed3f1709884c1 100644 --- a/paddle/scripts/conda_build.py +++ b/paddle/scripts/conda_build.py @@ -171,7 +171,7 @@ def meta_build_mac(var, python_str, paddle_version, build_var, build_name_str): def meta_build_linux( var, python_str, paddle_version, build_var, build_name_str, cuda_str=None ): - if cuda_str == None: + if cuda_str is None: package_str = ( """ package: @@ -192,7 +192,7 @@ def meta_build_linux( ) meta_build = var.build + build_name_str meta_str = package_str + meta_build + requirement - if not (cuda_str == None): + if not (cuda_str is None): meta_str = meta_str + cuda_str meta_str = meta_str + var.test + var.about @@ -209,7 +209,7 @@ def meta_build_linux( def meta_build_windows( var, python_str, paddle_version, blt_var, build_name_str, cuda_str=None ): - if cuda_str == None: + if cuda_str is None: package_str = ( """ package: @@ -235,7 +235,7 @@ def meta_build_windows( meta_build = var.build + build_name_str meta_str = package_str + meta_build + requirement - if not (cuda_str == None): + if not (cuda_str is None): meta_str = meta_str + cuda_str blt_str = var.blt_const + blt_var diff --git a/python/paddle/cost_model/cost_model.py b/python/paddle/cost_model/cost_model.py index 8797868287ba0..b3178d2e05a66 100644 --- a/python/paddle/cost_model/cost_model.py +++ b/python/paddle/cost_model/cost_model.py @@ -74,7 +74,7 @@ def static_cost_data(self): def get_static_op_time(self, op_name, forward=True, dtype="float32"): # if forward is True, return op forward time, otherwise return op backward time. 
- if op_name == None: + if op_name is None: raise ValueError( 'op_name should not be empty when you want to get static op time' ) diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py index e95a9e6df0066..622d33aa1873c 100644 --- a/python/paddle/dataset/imdb.py +++ b/python/paddle/dataset/imdb.py @@ -45,7 +45,7 @@ def tokenize(pattern): # tarfile.extractfile, which does random access and might # destroy hard disks. tf = tarf.next() - while tf != None: + while tf is not None: if bool(pattern.match(tf.name)): # newline and punctuations removal and ad-hoc tokenization. yield tarf.extractfile(tf).read().rstrip(b'\n\r').translate( diff --git a/python/paddle/dataset/tests/imdb_test.py b/python/paddle/dataset/tests/imdb_test.py index c2a787cecd863..32dbc75b874e1 100644 --- a/python/paddle/dataset/tests/imdb_test.py +++ b/python/paddle/dataset/tests/imdb_test.py @@ -31,13 +31,13 @@ class TestIMDB(unittest.TestCase): word_idx = None def test_build_dict(self): - if self.word_idx == None: + if self.word_idx is None: self.word_idx = paddle.dataset.imdb.build_dict(TRAIN_PATTERN, 150) self.assertEqual(len(self.word_idx), 7036) def check_dataset(self, dataset, expected_size): - if self.word_idx == None: + if self.word_idx is None: self.word_idx = paddle.dataset.imdb.build_dict(TRAIN_PATTERN, 150) sum = 0 diff --git a/python/paddle/distributed/auto_parallel/cost/base_cost.py b/python/paddle/distributed/auto_parallel/cost/base_cost.py index 2ce98a0a0511e..35353de9b66d5 100644 --- a/python/paddle/distributed/auto_parallel/cost/base_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/base_cost.py @@ -587,7 +587,7 @@ def get_max_beta(self, ranks): if forward_order_beta > backward_order_beta else backward_order_beta ) - if max_beta == None: + if max_beta is None: max_beta = beta else: if beta > max_beta: diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 2ba3398132437..2a7b7f3e67daa 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -84,7 +84,7 @@ def partition( dist_op_context.rank_id = self._rank_id # partition startup program - if serial_startup_program == None: + if serial_startup_program is None: partitioned_startup_prog = None else: partitioned_startup_prog = self.partition_startup_program( diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 10d2556f299ce..ebe478f1dff02 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -61,7 +61,7 @@ def new_process_group(ranks, group_id=None): num_groups = len(_g_process_group_map) # Note: our process group may interfere with the original implementation # so the created group id should start from the original _new_ring_id() - if group_id == None: + if group_id is None: group_id = _new_ring_id() + num_groups + 1 new_pg = ProcessGroup(group_id, ranks) diff --git a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py b/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py index a5cdbc7f95a91..3cd58f2c00402 100644 --- a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py +++ b/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py @@ -530,7 +530,7 @@ def _update(self, i, trial, results): self._finished_trials.append(trial) cur_mertic = get_metric(results) - if 
self._best_metric == None or cur_mertic > self._best_metric: + if self._best_metric is None or cur_mertic > self._best_metric: self._best_metric = cur_mertic self._best_iter = i diff --git a/python/paddle/distributed/elastic.py b/python/paddle/distributed/elastic.py index d1fd9a790f2d4..55b73ab315bb4 100644 --- a/python/paddle/distributed/elastic.py +++ b/python/paddle/distributed/elastic.py @@ -31,7 +31,7 @@ def set_np(self, np): self.etcd.put(self.np_path, '{}'.format(np).encode('latin-1')) def scale_np(self, np): - if self.etcd.get(self.np_path)[0] != None: + if self.etcd.get(self.np_path)[0] is not None: self.set_np(np) return True return False diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 28dace611a092..3a9e7e7aa47a3 100755 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -293,7 +293,7 @@ def __get_default_iface_from_gateway(self): if "Gateway" in item and "Iface" in item: gateway_idx = item.index("Gateway") iface_idx = item.index("Iface") - elif gateway_idx != None and iface_idx != None: + elif gateway_idx is not None and iface_idx is not None: gateway = None if len(item) > gateway_idx: gateway = item[gateway_idx] @@ -845,7 +845,7 @@ def _ps_env(self): # each role will execute it self._server_endpoints = self._server_endpoints.split(",") self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", None) - if self._worker_endpoints != None: + if self._worker_endpoints is not None: self._worker_endpoints = self._worker_endpoints.split(",") else: self._worker_endpoints = [] @@ -860,14 +860,14 @@ def _ps_env(self): # each role will execute it self._coordinator_endpoints = self._coordinator_endpoints.split(",") trainers_num = os.getenv("PADDLE_TRAINERS_NUM", None) - if trainers_num == None: + if trainers_num is None: raise ValueError( "Can not find PADDLE_TRAINERS_NUM, please check your environment." ) trainers_num = int(trainers_num) training_role = os.getenv("TRAINING_ROLE", None) - if training_role == None: + if training_role is None: raise ValueError( "Can not find TRAINING_ROLE, please check your environment." ) @@ -937,20 +937,20 @@ def _ps_env(self): # each role will execute it if training_role == "TRAINER": role = Role.WORKER current_id = os.getenv("PADDLE_TRAINER_ID", None) - if current_id == None: + if current_id is None: raise ValueError( "Can not find PADDLE_TRAINER_ID, please check your environment." ) current_id = int(current_id) if self._is_heter_parameter_server_mode: self._stage_id = os.getenv("STAGE_ID", None) - if self._stage_id == None: + if self._stage_id is None: raise ValueError( "Can not find STAGE_ID, please check your environment." ) self._stage_id = int(self._stage_id) self._stage_num = os.getenv("STAGE_NUM", None) - if self._stage_num == None: + if self._stage_num is None: raise ValueError( "Can not find STAGE_NUM, please check your environment." ) @@ -958,18 +958,18 @@ def _ps_env(self): # each role will execute it self._stage_trainers = os.getenv( "PADDLE_STAGE_TRAINERS_NUM", None ) - if self._stage_trainers == None: + if self._stage_trainers is None: raise ValueError( "Can not find PADDLE_STAGE_TRAINERS_NUM, please check your environment." ) self._stage_trainers = eval(self._stage_trainers) cur_port = os.getenv("PADDLE_PORT", None) - if cur_port == None: + if cur_port is None: raise ValueError( "Can not find PADDLE_PORT, please check your environment." 
) cur_ip = os.getenv("POD_IP", None) - if cur_ip == None: + if cur_ip is None: raise ValueError( "Can not find POD_IP, please check your environment." ) @@ -982,12 +982,12 @@ def _ps_env(self): # each role will execute it elif training_role == "PSERVER": role = Role.SERVER cur_port = os.getenv("PADDLE_PORT", None) - if cur_port == None: + if cur_port is None: raise ValueError( "Can not find PADDLE_PORT, please check your environment." ) cur_ip = os.getenv("POD_IP", None) - if cur_ip == None: + if cur_ip is None: raise ValueError( "Can not find POD_IP, please check your environment." ) @@ -997,20 +997,20 @@ def _ps_env(self): # each role will execute it elif training_role == "HETER_TRAINER": role = Role.HETER_WORKER self._stage_id = os.getenv("STAGE_ID", None) - if self._stage_id == None: + if self._stage_id is None: raise ValueError( "Can not find STAGE_ID, please check your environment." ) self._stage_id = int(self._stage_id) self._stage_num = os.getenv("STAGE_NUM", None) - if self._stage_num == None: + if self._stage_num is None: raise ValueError( "Can not find STAGE_NUM, please check your environment." ) self._stage_num = int(self._stage_num) self._stage_trainers = os.getenv("PADDLE_STAGE_TRAINERS_NUM", None) - if self._stage_trainers == None: + if self._stage_trainers is None: raise ValueError( "Can not find PADDLE_STAGE_TRAINERS_NUM, please check your environment." ) @@ -1019,7 +1019,7 @@ def _ps_env(self): # each role will execute it self._heter_trainer_device_type = os.getenv( "HETER_DEVICE_TYPE", None ) - if self._heter_trainer_device_type == None: + if self._heter_trainer_device_type is None: raise ValueError( "Can not find HETER_DEVICE_TYPE, please check your environment." ) @@ -1040,12 +1040,12 @@ def _ps_env(self): # each role will execute it ) cur_port = os.getenv("PADDLE_PORT", None) - if cur_port == None: + if cur_port is None: raise ValueError( "Can not find PADDLE_PORT, please check your environment." ) cur_ip = os.getenv("POD_IP", None) - if cur_ip == None: + if cur_ip is None: raise ValueError( "Can not find POD_IP, please check your environment." 
) diff --git a/python/paddle/distributed/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py index 14db852f7708d..d161eb680793a 100644 --- a/python/paddle/distributed/fleet/base/strategy_compiler.py +++ b/python/paddle/distributed/fleet/base/strategy_compiler.py @@ -204,13 +204,13 @@ def generate_optimizer( ) return_meta = ( - None if meta_optimizers == None else meta_optimizers[0] + None if meta_optimizers is None else meta_optimizers[0] ) return_graph = ( - None if graph_optimizers == None else graph_optimizers[0] + None if graph_optimizers is None else graph_optimizers[0] ) - if meta_optimizers == None or graph_optimizers == None: + if meta_optimizers is None or graph_optimizers is None: return return_meta, return_graph # do heuristic filter here, if any meta optimizer in graph optimizers is in diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py index 5af1acb941358..dcaa256a26d11 100755 --- a/python/paddle/distributed/fleet/base/util_factory.py +++ b/python/paddle/distributed/fleet/base/util_factory.py @@ -509,7 +509,9 @@ def check_not_expected_ops(prog, not_expected_op_types): } for each_var in saved_params: var_temp = fluid.global_scope().find_var(each_var.name) - assert var_temp != None, "can't not find var: " + each_var.name + assert var_temp is not None, ( + "can't find var: " + each_var.name + ) new_shape = (np.array(var_temp.get_tensor())).shape assert each_var.name in orig_para_shape, ( each_var.name + "MUST in var list" ) diff --git a/python/paddle/distributed/fleet/data_generator/data_generator.py b/python/paddle/distributed/fleet/data_generator/data_generator.py index 9e9ec7b61bc46..ec6114dd21f3a 100644 --- a/python/paddle/distributed/fleet/data_generator/data_generator.py +++ b/python/paddle/distributed/fleet/data_generator/data_generator.py @@ -79,7 +79,7 @@ def local_iter(): batch_samples = [] line_iter = self.generate_sample(None) for user_parsed_line in line_iter(): - if user_parsed_line == None: + if user_parsed_line is None: continue batch_samples.append(user_parsed_line) if len(batch_samples) == self.batch_size_: @@ -121,7 +121,7 @@ def local_iter(): for line in sys.stdin: line_iter = self.generate_sample(line) for user_parsed_line in line_iter(): - if user_parsed_line == None: + if user_parsed_line is None: continue batch_samples.append(user_parsed_line) if len(batch_samples) == self.batch_size_: diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index 2630fa8283eed..695f03fe1f2a0 100644 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -1285,7 +1285,7 @@ def _minimize_impl( context["origin_main_program"] = self.origin_main_program context["origin_main_programs"] = [self.origin_main_program] context["loss"] = loss - if startup_program == None: + if startup_program is None: self.origin_startup_program = ( paddle.static.default_startup_program().clone(for_test=False) ) diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 998f64c3ec293..ecf6436b94fd5 100755 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -796,7 +796,7 @@ def launch(): ) # which_distributed_mode must modify args.backend else: assert ( - args.run_mode == 'collective' or args.run_mode == None + args.run_mode == 'collective' or args.run_mode is None ), "When backend is not 'auto', run mode must be
collective" check_backend(args.backend) distribute_mode = DistributeMode.COLLECTIVE diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index d4b6b86119fa3..e471535c274bc 100755 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -120,7 +120,7 @@ def pods_endpoints(self): for pod in self.pods: ep = "{}:{}".format(pod.addr, pod.port) assert ( - pod.port != None and pod.addr != None + pod.port is not None and pod.addr is not None ), "{} not a valid endpoint".format(ep) r.append(ep) return r @@ -979,7 +979,7 @@ def get_custom_endpoints(origin_endpoints, offset=0): origin_endpoint: ip:port user_define_endpoint: ip:(port+offset) """ - assert origin_endpoints != None + assert origin_endpoints is not None paddle_user_define_endpoints_list = [] for ip_port in origin_endpoints.split(","): ip = ip_port.split(":")[0] @@ -1625,7 +1625,7 @@ def get_role_endpoints(self, args): else: self.is_local = False pod_ip = os.getenv("POD_IP", None) - if pod_ip == None: + if pod_ip is None: _, self.current_node_ip = get_host_name_ip() else: self.current_node_ip = pod_ip diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index a1a33992d5946..9b077f26a9f62 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ -269,7 +269,7 @@ def _enable_strategy(self, dist_strategy, context): def minimize( self, loss, startup_program=None, parameter_list=None, no_grad_set=None ): - if startup_program == None: + if startup_program is None: startup_program = paddle.static.default_startup_program() compiled_program = self._try_to_compile( startup_program, loss.block.program, loss diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py index 000ef98bcaec4..5dfa2cb7e68df 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py @@ -133,7 +133,7 @@ def minimize_impl( self.inner_opt.minimize( loss, startup_program, parameter_list, no_grad_set ) - if startup_program == None: + if startup_program is None: startup_program = paddle.static.default_startup_program() # print("program after inner optimizer minimize:", diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index e2c5e5da29ea4..b746a757f5165 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -82,7 +82,7 @@ def build_compiled_startegy(self): def _load_sparse_params( self, executor, dirname, varnames, main_program=None ): - assert vars != None + assert vars is not None check_vars = [] load_prog = Program() load_block = load_prog.global_block() diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index 7de34aa6e1c85..f5cdd7f8dd74c 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -997,7 +997,7 @@ def _add_tensor_table(tables): tensor_table_dict = self.compiled_strategy.get_tensor_table_dict() program_idx = 0 for 
table_name in tensor_table_dict: - if tensor_table_dict[table_name]["startup_program"] != None: + if tensor_table_dict[table_name]["startup_program"] is not None: tensor_table_dict[table_name][ "startup_program_id" ] = program_idx @@ -1005,7 +1005,7 @@ def _add_tensor_table(tables): tensor_table_dict[table_name]["startup_program"].desc ) program_idx += 1 - if tensor_table_dict[table_name]["main_program"] != None: + if tensor_table_dict[table_name]["main_program"] is not None: tensor_table_dict[table_name][ "main_program_id" ] = program_idx @@ -1241,7 +1241,7 @@ def _stop_worker(self): self._communicator.stop() if self.role_maker._is_heter_parameter_server_mode: assert ( - self._heter_client != None + self._heter_client is not None ), "heter client should not be None in heterps mode" self._heter_client.stop() # executor = self._get_executor() diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index 046a5aee69d3e..8a67301e17461 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -574,7 +574,7 @@ def _ls_dir(self, fs_path): def _test_match(self, lines): for l in lines: m = self._bd_err_re.match(l) - if m != None: + if m is not None: return m return None diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py index 6dd100a6f9e70..a56c71fc40b5d 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py @@ -466,7 +466,7 @@ def _find_prev_op(self, index, var_name): variable named var_name. """ prev_ops = self._output_var_to_op[var_name] - if prev_ops == None: + if prev_ops is None: return None result_op = None for prev_op, prev_idx in reversed(prev_ops): diff --git a/python/paddle/distributed/ps/coordinator.py b/python/paddle/distributed/ps/coordinator.py index 3a6e0756df435..a357d9677abf1 100755 --- a/python/paddle/distributed/ps/coordinator.py +++ b/python/paddle/distributed/ps/coordinator.py @@ -287,7 +287,7 @@ def callback_init_worker(self): fleet.init_worker() def callback_initialize_model_params(self): - if self.exe == None or self.main_program == None: + if self.exe is None or self.main_program is None: raise AssertionError("exe or main_program not set") self.exe.run(self.startup_program) diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index d341a95b24be7..825801f17aeb9 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -1326,7 +1326,7 @@ def sync_strategy_envs(): ) # --> HeterClient::GetInstance def _init_coordinator(self, scopes=None): - if self._coordinator == None: + if self._coordinator is None: self._coordinator = Coordinator(self.string_hosts) print(">>> curr node ip: {}".format(self.coordinator_hosts[0])) @@ -1336,7 +1336,7 @@ def _init_coordinator(self, scopes=None): ) def _make_fl_strategy(self): - if self._coordinator == None: + if self._coordinator is None: assert "Coordinator py object is null!" 
else: self._coordinator.make_fl_strategy() @@ -1401,7 +1401,7 @@ def _stop_worker(self): self._worker.stop_worker() if self.is_heter_ps_mode: assert ( - self._heter_client != None + self._heter_client is not None ), "heter client should not be None in heterps mode" self._heter_client.stop() diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index 53628ad7e5084..578d664dc4137 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -671,7 +671,7 @@ def _is_heter_op(op, current_heter_device, default_device="cpu"): # Todo: need update this method # op._set_attr('op_device', current_heter_device) return True - elif op_device == None or op_device == default_device: + elif op_device is None or op_device == default_device: op._set_attr('op_device', default_device) return False return False diff --git a/python/paddle/distributed/rpc/rpc.py b/python/paddle/distributed/rpc/rpc.py index 6a93f27517c21..e01446a53744a 100644 --- a/python/paddle/distributed/rpc/rpc.py +++ b/python/paddle/distributed/rpc/rpc.py @@ -106,7 +106,7 @@ def init_rpc(name, rank=None, world_size=None, master_endpoint=None): logger.info("Trainer {}: worker endpoint: {}".format(rank, worker_endpoint)) master_endpoint = ( master_endpoint - if master_endpoint != None + if master_endpoint is not None else os.environ["PADDLE_MASTER_ENDPOINT"] ) master_addr, master_port = master_endpoint.split(":") diff --git a/python/paddle/distributed/utils/launch_utils.py b/python/paddle/distributed/utils/launch_utils.py index 88acc643ead83..975f5d4935644 100644 --- a/python/paddle/distributed/utils/launch_utils.py +++ b/python/paddle/distributed/utils/launch_utils.py @@ -180,7 +180,7 @@ def pods_endpoints(self): for pod in self.pods: ep = "{}:{}".format(pod.addr, pod.port) assert ( - pod.port != None and pod.addr != None + pod.port is not None and pod.addr is not None ), "{} not a valid endpoint".format(ep) r.append(ep) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 7823b3f7bd47a..e73e2fe1ab10b 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1942,7 +1942,7 @@ def append_backward( # sub-block (control flow) is_recompute = False if ( - checkpoints != None + checkpoints is not None and isinstance(checkpoints, list) and len(checkpoints) > 0 ): diff --git a/python/paddle/fluid/communicator.py b/python/paddle/fluid/communicator.py index be3fb5ea22deb..d947d7b154370 100755 --- a/python/paddle/fluid/communicator.py +++ b/python/paddle/fluid/communicator.py @@ -63,8 +63,8 @@ def __init__(self, mode, kwargs=None, envs=None): """ # set all recv op to not_run mode - if kwargs == None: - if envs == None: + if kwargs is None: + if envs is None: envs = {} else: if mode == DistributedMode.SYNC: @@ -97,7 +97,7 @@ def __init__(self, mode, kwargs=None, envs=None): def init_with_ctx( self, send_ctx, recv_ctx, proto_txt, unit64_hosts, scope=None ): - if scope == None: + if scope is None: scope = global_scope() self.communicator_ = core.DistCommunicator( self.mode, @@ -144,7 +144,7 @@ def start(self): comm.start() comm.stop() """ - if self.communicator_ == None: + if self.communicator_ is None: print('you must call init_with_ctx first to init comm before start') return self.communicator_.start() @@ -166,7 +166,7 @@ def stop(self): comm.start() comm.stop() """ - if self.communicator_ == None: + if self.communicator_ is None: print('you must call init_with_ctx first to init comm before 
stop') return self.communicator_.stop() @@ -187,7 +187,7 @@ def is_running(self): comm = fluid.communicator.Communicator(prog) comm.is_running() """ - if self.communicator_ == None: + if self.communicator_ is None: print('you must call init_with_ctx first to init comm before stop') return self.communicator_.is_running() @@ -202,7 +202,7 @@ def pull_dense(self, context): self.communicator_.pull_dense(context) def push_sparse_param(self, var_name, table_id=-1, scope=None): - if scope == None: + if scope is None: scope = global_scope() if not self.is_running(): raise ValueError( @@ -226,14 +226,14 @@ def __init__(self, ps_hosts, kwargs=None): self.init_with_ctx(send_ctx, dense_map, prototxt, ps_hosts) def start_coordinator(self, self_endpoint, trainer_endpoints): - if self.communicator_ != None: + if self.communicator_ is not None: self.communicator_.start_coordinator( self_endpoint, trainer_endpoints ) return def save_fl_strategy(self, mp): - if self.communicator_ != None: + if self.communicator_ is not None: self.communicator_.save_fl_strategy(mp) else: raise ValueError("self.communicator_ is null") @@ -241,7 +241,7 @@ def save_fl_strategy(self, mp): def query_fl_clients_info(self): info_mp = {} - if self.communicator_ != None: + if self.communicator_ is not None: info_mp = self.communicator_.query_fl_clients_info() return info_mp diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 08ab46dfac9eb..3695f8cad20d7 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -1242,7 +1242,7 @@ def sparse_embedding( ) entry_str = entry._to_attr() - if slot == None: + if slot is None: slot = 0 helper.append_op( diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py index 668ce445f78b0..a84ddcc968a1f 100644 --- a/python/paddle/fluid/contrib/layers/rnn_impl.py +++ b/python/paddle/fluid/contrib/layers/rnn_impl.py @@ -375,7 +375,7 @@ def get_single_direction_output( rnn.step_output(new_hidden) step_input = new_hidden - if dropout_prob != None and dropout_prob > 0.0: + if dropout_prob is not None and dropout_prob > 0.0: step_input = layers.dropout( step_input, dropout_prob=dropout_prob, @@ -677,7 +677,7 @@ def get_single_direction_output( rnn.step_output(new_cell) step_input = new_hidden - if dropout_prob != None and dropout_prob > 0.0: + if dropout_prob is not None and dropout_prob > 0.0: step_input = layers.dropout( step_input, dropout_prob=dropout_prob, diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index c959fc29bc918..3db16060e0e60 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -344,7 +344,7 @@ def __init__( # Save input params self._bias_correction = bias_correction self._executor = executor - self._scope = global_scope() if scope == None else scope + self._scope = global_scope() if scope is None else scope self._model_dir = model_dir self._model_filename = model_filename self._params_filename = params_filename diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 81aac8823d813..7a9b89866ebfb 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ 
b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -1874,8 +1874,8 @@ def __init__( '%s_grad' % (op) for op in self._quantizable_op_type ] - assert self._scope != None, "scope must not be None." - assert self._place != None, "place must not be None." + assert self._scope is not None, "scope must not be None." + assert self._place is not None, "place must not be None." def apply(self, graph): """ @@ -2737,8 +2737,8 @@ def __init__( '%s_grad' % (op) for op in self._quantizable_op_type ] - assert self._scope != None, "scope must not be None." - assert self._place != None, "place must not be None." + assert self._scope is not None, "scope must not be None." + assert self._place is not None, "place must not be None." self.persistable_vars = [] def apply(self, graph): @@ -2878,8 +2878,8 @@ def __init__(self, scope, place, quant_bits=8): self._place = _get_paddle_place(place) self._scope = scope self._quant_bits = quant_bits - assert self._scope != None, "scope must not be None." - assert self._place != None, "place must not be None." + assert self._scope is not None, "scope must not be None." + assert self._place is not None, "place must not be None." def apply(self, graph): assert isinstance( @@ -3027,8 +3027,8 @@ def __init__( self._bias_correction = bias_correction self._quant_bits = quant_bits self._save_int_weight = save_int_weight - assert self._scope != None, "scope must not be None." - assert self._place != None, "place must not be None." + assert self._scope is not None, "scope must not be None." + assert self._place is not None, "place must not be None." def apply(self, graph): assert isinstance( diff --git a/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py b/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py index 12f6f3c53d262..dbc6277a3bf10 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py @@ -162,7 +162,7 @@ def convert(self, test_program, scope=None): scope(fluid.Scope, optional): The scope of the program, use it to load and save variables. If scope=None, get scope by global_scope(). """ - scope = global_scope() if scope == None else scope + scope = global_scope() if scope is None else scope for block in test_program.blocks: for op in block.ops: diff --git a/python/paddle/fluid/contrib/slim/quantization/utils.py b/python/paddle/fluid/contrib/slim/quantization/utils.py index 11e39116389c2..9862772c64a98 100644 --- a/python/paddle/fluid/contrib/slim/quantization/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/utils.py @@ -332,7 +332,7 @@ def set_variable_data(scope, place, var_name, np_value): np_value, np.ndarray ), 'The type of value should be numpy array.' 
var_node = scope.find_var(var_name) - if var_node != None: + if var_node is not None: tensor = var_node.get_tensor() tensor.set(np_value, place) diff --git a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py index 96635700666e2..cc8136e3b7b4c 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py +++ b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py @@ -219,7 +219,7 @@ def save_dict(Dict, mapping_table_path): mapping_table = load_dict(mapping_table_path) test_graph.out_node_mapping_table = mapping_table - if act_quantize_func == None and weight_quantize_func == None: + if act_quantize_func is None and weight_quantize_func is None: freeze_pass.apply(test_graph) tempdir.cleanup() diff --git a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py index 929eb34994b46..8a8099df945e1 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py @@ -40,7 +40,7 @@ def _set_variable_data(scope, place, var_name, np_value): np_value, np.ndarray ), 'The type of value should be numpy array.' var_node = scope.find_var(var_name) - if var_node != None: + if var_node is not None: tensor = var_node.get_tensor() tensor.set(np_value, place) diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index ee82c7ebbdc10..9b23a942928ef 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -109,7 +109,7 @@ def _gen_worker_desc(self, trainer_desc): dense_table_set = set() program_id = str(id(self._program)) print("device worker program id:", program_id) - if self._program == None: + if self._program is None: print("program of current device worker is not configured") exit(-1) opt_info = self._program._fleet_opt @@ -259,7 +259,7 @@ def _gen_worker_desc(self, trainer_desc): dense_table_set = set() program_id = str(id(self._program)) print("device worker program id:", program_id) - if self._program == None: + if self._program is None: print("program of current device worker is not configured") exit(-1) opt_info = self._program._fleet_opt @@ -392,7 +392,7 @@ def _gen_worker_desc(self, trainer_desc): """ dense_table_set = set() program_id = str(id(self._program)) - if self._program == None: + if self._program is None: print("program of current device worker is not configured") exit(-1) opt_info = self._program._fleet_opt @@ -511,7 +511,7 @@ def _gen_worker_desc(self, trainer_desc): """ dense_table_set = set() program_id = str(id(self._program)) - if self._program == None: + if self._program is None: print("program of current device worker is not configured") exit(-1) opt_info = self._program._fleet_opt diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 9511ce2db629f..4a1643733393a 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -34,9 +34,9 @@ def __init__( passwd=None, hadoop_bin="", ): - assert user != None - assert passwd != None - assert hadoop_bin != None + assert user is not None + assert passwd is not None + assert hadoop_bin is not None import ps_pb2 as pslib self.fs_client = pslib.FsClientParameter() diff --git 
a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py index 80bebbf501e55..dd6d7feb558e1 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py @@ -85,7 +85,7 @@ def visit_Return(self, node): if isinstance(node.value, gast.Name) and node.value.id == 'None': node.value = None return node - if isinstance(node.value, gast.Constant) and node.value.value == None: + if isinstance(node.value, gast.Constant) and node.value.value is None: node.value = None return node return node diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 889a910d8b9ac..661cfaaa00118 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1046,7 +1046,7 @@ def forward(self, input): for prefix, layer in model.named_sublayers(): print(prefix, layer) """ - assert isinstance(sublayer, Layer) or sublayer == None + assert isinstance(sublayer, Layer) or sublayer is None self._sub_layers[name] = sublayer return sublayer diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 55a0334c8e933..2dbf2d5cedafc 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -622,7 +622,7 @@ def _as_lodtensor(data, place, dtype=None): class FetchHandler(object): def __init__(self, var_dict=None, period_secs=60): - assert var_dict != None + assert var_dict is not None self.var_dict = var_dict self.period_secs = period_secs @@ -2309,7 +2309,7 @@ def _run_from_dataset( ) else: # cache trainer instance for heterps pipeline training - if fetch_list == None: + if fetch_list is None: fetch_list = [] cache_key = _get_strong_program_cache_key(program, None, fetch_list) trainer_instance = self._get_trainer_cache(cache_key) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 188ff9a8ea8d7..63ec07fe741ce 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2880,7 +2880,8 @@ def __init__( ) if 'force_cpu' in op_attrs: if ( - type == 'less_than' and op_attrs['force_cpu'] != None + type == 'less_than' + and op_attrs['force_cpu'] is not None ) or op_attrs['force_cpu'] != False: warnings.warn( "The Attr(force_cpu) of Op(%s) will be deprecated in the future, " diff --git a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py index 134723cdbc0c7..5fa0ed085b187 100644 --- a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py +++ b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py @@ -46,7 +46,7 @@ def _get_logger(log_level, name="auto_checkpoint"): global logger - if logger != None: + if logger is not None: return logger logger = logging.getLogger(name) @@ -683,12 +683,12 @@ def _get_valid_program(prog): def _auto_checkpoint(exe, prog): _get_checker() - assert exe._auto_checkpoint_name != None + assert exe._auto_checkpoint_name is not None if not _can_auto_checkpoint(prog): return program = _get_valid_program(prog) - assert program._auto_checkpoint_name != None + assert program._auto_checkpoint_name is not None exe_status = g_train_epoch_range._exe_status key = _get_running_key( diff --git a/python/paddle/fluid/incubate/data_generator/__init__.py b/python/paddle/fluid/incubate/data_generator/__init__.py index 10e4fba92dd5a..4729f44f2b12c 100644 --- 
a/python/paddle/fluid/incubate/data_generator/__init__.py +++ b/python/paddle/fluid/incubate/data_generator/__init__.py @@ -80,7 +80,7 @@ def local_iter(): batch_samples = [] line_iter = self.generate_sample(None) for user_parsed_line in line_iter(): - if user_parsed_line == None: + if user_parsed_line is None: continue batch_samples.append(user_parsed_line) if len(batch_samples) == self.batch_size_: @@ -117,7 +117,7 @@ def local_iter(): for line in sys.stdin: line_iter = self.generate_sample(line) for user_parsed_line in line_iter(): - if user_parsed_line == None: + if user_parsed_line is None: continue batch_samples.append(user_parsed_line) if len(batch_samples) == self.batch_size_: diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index 3349dcf275b3f..341eea35d1ce9 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -1010,7 +1010,7 @@ def __get_default_iface_from_gateway(self): if "Gateway" in item and "Iface" in item: gateway_idx = item.index("Gateway") iface_idx = item.index("Iface") - elif gateway_idx != None and iface_idx != None: + elif gateway_idx is not None and iface_idx is not None: gateway = None if len(item) > gateway_idx: gateway = item[gateway_idx] diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py index 229d6e44bd621..1b72d9db8aaa0 100644 --- a/python/paddle/fluid/incubate/fleet/collective/__init__.py +++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py @@ -170,7 +170,7 @@ def save_checkpoint( """ This function save persistables and current epoch num to path. """ - if main_program == None: + if main_program is None: main_program = self._transpiled_program m = PaddleModel(executor, main_program) @@ -203,7 +203,7 @@ def load_checkpoint( This function load persistables and current epoch num from path. 
""" - if main_program == None: + if main_program is None: main_program = self._transpiled_program m = PaddleModel(executor, main_program) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index fa818e3c41300..66973f4355981 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -737,7 +737,7 @@ def _is_heter_op(op, current_heter_device, default_device="cpu"): # Todo: need update this method # op._set_attr('op_device', current_heter_device) return True - elif op_device == None or op_device == default_device: + elif op_device is None or op_device == default_device: op._set_attr('op_device', default_device) return False return False diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index ed9db255146e5..e48c199cc24a5 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -499,7 +499,7 @@ def _minimize( for num in range(len(losses)): loss = losses[num] parameters = None - if parameter_list != None: + if parameter_list is not None: parameters = parameter_list[num] prog_id = str(id(loss.block.program)) # param_grads of program diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py index c35e266357e0e..e3f4c7c6acf71 100644 --- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py +++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py @@ -163,7 +163,7 @@ def _ls_dir(self, fs_path): def _test_match(self, lines): for l in lines: m = self._bd_err_re.match(l) - if m != None: + if m is not None: return m return None diff --git a/python/paddle/fluid/incubate/fleet/utils/utils.py b/python/paddle/fluid/incubate/fleet/utils/utils.py index c675fea39bc9c..ef022c96ecf86 100644 --- a/python/paddle/fluid/incubate/fleet/utils/utils.py +++ b/python/paddle/fluid/incubate/fleet/utils/utils.py @@ -256,7 +256,7 @@ def try_load_model_vars( } for each_var in saved_params: var_temp = fluid.global_scope().find_var(each_var.name) - assert var_temp != None, "can't not find var: " + each_var.name + assert var_temp is not None, "can't not find var: " + each_var.name new_shape = (np.array(var_temp.get_tensor())).shape assert each_var.name in orig_para_shape, ( each_var.name + "MUST in var list" diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index c5e55a95186a2..175cbb6fe3533 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1013,7 +1013,7 @@ def name_has_fc(var): if not isinstance(each_var, Parameter): continue var_temp = paddle.fluid.global_scope().find_var(each_var.name) - assert var_temp != None, "can't not find var: " + each_var.name + assert var_temp is not None, "can't not find var: " + each_var.name new_shape = (np.array(var_temp.get_tensor())).shape assert each_var.name in orig_para_shape, ( each_var.name + "MUST in var list" @@ -2146,7 +2146,7 @@ def load(program, model_path, executor=None, var_list=None): return elif os.path.isfile(model_path): - if var_list == None: + if var_list is None: raise ValueError( "var_list is required when loading model file saved with [ save_params, save_persistables, save_vars ]" ) @@ -2479,7 +2479,7 @@ def set_program_state(program, 
state_dict): for para in parameter_list: var_temp = paddle.fluid.global_scope().find_var(para.name) assert ( - var_temp != None + var_temp is not None ), "Variable [ {} ] Not found, Please make sure run startup program".format( para.name ) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 5b79e3b86fadf..6555fff7b8550 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -1953,7 +1953,7 @@ def less_than(x, y, force_cpu=None, cond=None, name=None): ) if cond is not None: check_type(cond, "cond", Variable, "less_than") - if force_cpu != None: + if force_cpu is not None: check_type(force_cpu, "force_cpu", bool, "less_than") helper = LayerHelper("less_than", **locals()) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b1a49e23cd7bb..6ef406202f862 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3856,7 +3856,7 @@ def data_norm( bias_default = param_attr.get("bias", 0.0) # create scale and shift(bias) when enable_scale_and_shift is True - if name == None: + if name is None: name = "dn" if enable_scale_and_shift: scale_w = helper.create_parameter( @@ -5234,17 +5234,17 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): dim = [dim] if in_dygraph_mode(): - return _C_ops.max(input, dim if dim != None else [], keep_dim) + return _C_ops.max(input, dim if dim is not None else [], keep_dim) helper.append_op( type='reduce_max', inputs={'X': input}, outputs={'Out': out}, attrs={ - 'dim': dim if dim != None and dim != [] else [0], + 'dim': dim if dim is not None and dim != [] else [0], 'keep_dim': keep_dim, 'reduce_all': True - if dim == None or dim == [] or len(dim) == len(input.shape) + if dim is None or dim == [] or len(dim) == len(input.shape) else False, }, ) @@ -5306,17 +5306,17 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): dim = [dim] if in_dygraph_mode(): - return _C_ops.min(input, dim if dim != None else [], keep_dim) + return _C_ops.min(input, dim if dim is not None else [], keep_dim) helper.append_op( type='reduce_min', inputs={'X': input}, outputs={'Out': out}, attrs={ - 'dim': dim if dim != None and dim != [] else [0], + 'dim': dim if dim is not None and dim != [] else [0], 'keep_dim': keep_dim, 'reduce_all': True - if dim == None or dim == [] or len(dim) == len(input.shape) + if dim is None or dim == [] or len(dim) == len(input.shape) else False, }, ) @@ -5387,10 +5387,10 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): if in_dygraph_mode(): return _C_ops.prod( input, - dim if dim != None and dim != [] else [0], + dim if dim is not None and dim != [] else [0], keep_dim, True - if dim == None or dim == [] or len(dim) == len(input.shape) + if dim is None or dim == [] or len(dim) == len(input.shape) else False, ) @@ -5404,10 +5404,10 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): inputs={'X': input}, outputs={'Out': out}, attrs={ - 'dim': dim if dim != None and dim != [] else [0], + 'dim': dim if dim is not None and dim != [] else [0], 'keep_dim': keep_dim, 'reduce_all': True - if dim == None or dim == [] or len(dim) == len(input.shape) + if dim is None or dim == [] or len(dim) == len(input.shape) else False, }, ) @@ -5462,7 +5462,7 @@ def reduce_all(input, dim=None, keep_dim=False, name=None): dim = [dim] if in_dygraph_mode(): - return _C_ops.all(input, dim if dim != None else [], keep_dim) + return _C_ops.all(input, dim if dim is not None else [], 
keep_dim) check_variable_and_dtype(input, 'input', ('bool'), 'reduce_all') helper = LayerHelper('reduce_all', **locals()) @@ -5472,10 +5472,10 @@ def reduce_all(input, dim=None, keep_dim=False, name=None): inputs={'X': input}, outputs={'Out': out}, attrs={ - 'dim': dim if dim != None and dim != [] else [0], + 'dim': dim if dim is not None and dim != [] else [0], 'keep_dim': keep_dim, 'reduce_all': True - if dim == None or dim == [] or len(dim) == len(input.shape) + if dim is None or dim == [] or len(dim) == len(input.shape) else False, }, ) @@ -5535,10 +5535,10 @@ def reduce_any(input, dim=None, keep_dim=False, name=None): inputs={'X': input}, outputs={'Out': out}, attrs={ - 'dim': dim if dim != None and dim != [] else [0], + 'dim': dim if dim is not None and dim != [] else [0], 'keep_dim': keep_dim, 'reduce_all': True - if dim == None or dim == [] or len(dim) == len(input.shape) + if dim is None or dim == [] or len(dim) == len(input.shape) else False, }, ) @@ -11386,7 +11386,7 @@ def unstack(x, axis=0, num=None): """ if _non_static_mode(): - if num == None: + if num is None: num = x.shape[axis] if num == 0: return [] diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 8f15508e699c1..431e7a1481a8d 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -99,7 +99,7 @@ def __init__(self, name): The MetricBase or its succeed classes """ - self._name = str(name) if name != None else self.__class__.__name__ + self._name = str(name) if name is not None else self.__class__.__name__ def __str__(self): return self._name diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py index cf5cbf60ea3d8..585ff39a82efd 100644 --- a/python/paddle/fluid/net_drawer.py +++ b/python/paddle/fluid/net_drawer.py @@ -114,7 +114,7 @@ def draw_graph(startup_program, main_program, **kwargs): graph_id = unique_id() filename = kwargs.get("filename") - if filename == None: + if filename is None: filename = str(graph_id) + ".gv" g = Graph( name=str(graph_id), @@ -129,6 +129,6 @@ def draw_graph(startup_program, main_program, **kwargs): parse_graph(startup_program, g, var_dict) parse_graph(main_program, g, var_dict) - if filename != None: + if filename is not None: g.save() return g diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index f06c380838f15..b347aa48da26d 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -707,7 +707,7 @@ def _add_accumulator( name, param.name ) ) - if shape == None: + if shape is None: shape = param.shape assert isinstance(self.helper, LayerHelper) @@ -770,7 +770,7 @@ def _add_global_accumulator( if framework._non_static_mode(): return self._global_accumulators[name] raise Exception("Global accumulator {} already exists".format(name)) - if shape == None: + if shape is None: shape = [1] # most case, global accumulator is of shape [1] assert isinstance(self.helper, LayerHelper) @@ -1268,7 +1268,7 @@ def apply_gradients(self, params_grads): # NOTE(zhiqiu): currently, only support ClipGradByGlobalNorm and without regularization. 
if self._flatten_param_grads and self.regularization is None: - if self._grad_clip == None or isinstance( + if self._grad_clip is None or isinstance( self._grad_clip, ClipGradByGlobalNorm ): params_grads = self.flatten_param_grads(params_grads) @@ -3344,7 +3344,7 @@ def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) # create the dpsgd optimize op - if self._seed == None: + if self._seed is None: self._seed = 0 if framework._non_static_mode(): @@ -4454,10 +4454,10 @@ def _add_average_apply_op(self, block, param_grad): tmp = layers.sum(x=[num_accumulates, old_num_accumulates]) sum = layers.sum(x=[sum_1, sum_2, sum_3]) tmp = layers.cast( - x=tmp, dtype='float32' if self._dtype == None else self._dtype + x=tmp, dtype='float32' if self._dtype is None else self._dtype ) sum = layers.cast( - x=sum, dtype='float32' if self._dtype == None else self._dtype + x=sum, dtype='float32' if self._dtype is None else self._dtype ) ops._elementwise_div(x=sum, y=tmp, out=param) @@ -5254,7 +5254,7 @@ def _find_post_op(self, index, var_name): var_name = var_name.replace('.cast_fp16', '') post_ops = self.input_var_to_op[var_name] - if post_ops == None: + if post_ops is None: return None result_op = None for post_op, post_idx in reversed(post_ops): @@ -5269,7 +5269,7 @@ def _find_prev_op(self, index, var_name): variable named var_name. """ prev_ops = self.output_var_to_op[var_name] - if prev_ops == None: + if prev_ops is None: return None result_op = None for prev_op, prev_idx in reversed(prev_ops): @@ -7270,7 +7270,7 @@ def _parse_forward(self): if output_var in self.un_offload_checkpoint_names: # insert sync op if last checkpoint has not been sync - if last_offload_checkpoint != None: + if last_offload_checkpoint is not None: if ( self.checkpoint_usage_count_and_idx[ last_offload_checkpoint @@ -7400,7 +7400,7 @@ def _offload(self, loss, startup_program=None): """ self._main_program = loss.block.program self.block = loss.block - if startup_program == None: + if startup_program is None: startup_program = paddle.static.default_startup_program() with program_guard(self._main_program, startup_program): diff --git a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py index 33f98c219d84a..195857b8a7627 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py @@ -66,7 +66,7 @@ def tearDownClass(cls): def custom_raw_relu(self, x): module = importlib.import_module(MODULE_NAME) custom_raw_relu_op = getattr(module, "custom_raw_relu") - self.assertTrue(custom_raw_relu_op is not None) + self.assertIsNotNone(custom_raw_relu_op) return custom_raw_relu_op(x) def test_static(self): diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py index 389645139c578..3e352c816bbc6 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py @@ -139,9 +139,9 @@ def test_decorate(self): name, None ) if ASPHelper._is_supported_layer(program, name): - self.assertTrue(mask_var is not None) + self.assertIsNotNone(mask_var) else: - self.assertTrue(mask_var is None) + self.assertIsNone(mask_var) def test_asp_training(self): self.optimizer = paddle.incubate.asp.decorate(self.optimizer) diff --git 
a/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py index 54b11c4934cd0..cbe899a7e6eb2 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py @@ -23,7 +23,7 @@ def test_default_config(self): recompute = strategy.recompute self.assertEqual(recompute.enable, False) - self.assertEqual(recompute.checkpoints, None) + self.assertIsNone(recompute.checkpoints) amp = strategy.amp self.assertEqual(amp.enable, False) @@ -59,12 +59,12 @@ def test_default_config(self): self.assertEqual(qat.weight_bits, 8) self.assertEqual(qat.activation_bits, 8) self.assertEqual(qat.not_quant_pattern, ['skip_quant']) - self.assertEqual(qat.algo, None) + self.assertIsNone(qat.algo) tuning = strategy.tuning self.assertEqual(tuning.enable, False) self.assertEqual(tuning.batch_size, 1) - self.assertEqual(tuning.dataset, None) + self.assertIsNone(tuning.dataset) self.assertEqual(tuning.profile_start_step, 1) self.assertEqual(tuning.profile_end_step, 1) self.assertEqual(tuning.run_after_tuning, True) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py index 4bb9272748bf2..4a6df10ec546c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py @@ -395,7 +395,7 @@ def test_partitioner(self): # test fill_constant_batch_size_like - self.assertTrue(fill_op is not None) + self.assertIsNotNone(fill_op) ref_shape = [-1, 8, 0, 48] shape = fill_op.attr("shape") self.assertTrue(ref_shape == shape) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_checkpoint.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_checkpoint.py index cb61e2c9a8ab2..4d4ed5e488fc7 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_checkpoint.py @@ -71,15 +71,15 @@ def _run_normal(self): exe, main_prog, startup_prog ) for i in range(3): - self.assertEqual(acp._get_train_epoch_range(), None) - self.assertEqual(acp.g_acp_type, None) + self.assertIsNone(acp._get_train_epoch_range()) + self.assertIsNone(acp.g_acp_type) for data in data_loader(): - self.assertEqual(acp.g_acp_type, None) - self.assertEqual(acp._get_train_epoch_range(), None) + self.assertIsNone(acp.g_acp_type) + self.assertIsNone(acp._get_train_epoch_range()) fetch = exe.run(compiled, feed=data, fetch_list=[loss]) - self.assertEqual(acp.g_acp_type, None) - self.assertEqual(acp._get_train_epoch_range(), None) + self.assertIsNone(acp.g_acp_type) + self.assertIsNone(acp._get_train_epoch_range()) m1 = PaddleModel(exe, compiled) m1.serialize(save_dir) @@ -136,7 +136,7 @@ def _run_save_0(self, break_epoch_no=None): break o = acp._get_train_epoch_range() - assert o == None, "now train epoch must not exits now" + assert o is None, "now train epoch must not exist now" if break_epoch_no is None: self.assertEqual(i, 2) else: @@ -169,7 +169,7 @@ def _run_load_0(self, break_epoch_no=None): fetch = exe.run(compiled, feed=data, fetch_list=[loss]) o = acp._get_train_epoch_range() - self.assertTrue(o == None, "now train epoch must not exits now") + self.assertTrue(o is None, "now train epoch must not exist now") self.assertEqual(i, 2) if break_epoch_no is not
None: diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_checkpoint_dist_basic.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_checkpoint_dist_basic.py index 702c3eb24a3fc..7c0d444acc807 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_checkpoint_dist_basic.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_checkpoint_dist_basic.py @@ -98,7 +98,7 @@ def test_distributed_basic(self): self.assertEqual(len(o._exe_status), 1) o = acp._get_train_epoch_range() - assert o == None, "now train epoch must not exits now" + assert o is None, "now train epoch must not exist now" self.assertEqual(i, 2) fs.delete(save_dir) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_checkpoint_multiple.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_checkpoint_multiple.py index 6bb59c5d2aa0c..82bee87b55f8f 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_checkpoint_multiple.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_auto_checkpoint_multiple.py @@ -94,7 +94,7 @@ def test_multiple(self): epochs.append(i) o = acp._get_train_epoch_range() - self.assertTrue(o == None, "now train epoch must not exits now") + self.assertTrue(o is None, "now train epoch must not exist now") self.assertEqual(i, 2) self.assertEqual(epochs, [0, 1, 2]) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_rolemaker_new.py index 18c2487b66a95..4152dbf3f8151 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_rolemaker_new.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_rolemaker_new.py @@ -45,8 +45,8 @@ def test_rolemaker_base(self): self.assertTrue(len(pserver_endpoints) == 0) print(role.to_string()) - self.assertTrue(role._all_gather(1, "worker") is None) - self.assertTrue(role._all_reduce(1, "sum", "worker") is None) + self.assertIsNone(role._all_gather(1, "worker")) + self.assertIsNone(role._all_reduce(1, "sum", "worker")) role._barrier("worker") diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_hdfs1.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_hdfs1.py index b141b1ed65bc6..0b1f7b34b317f 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_hdfs1.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_hdfs1.py @@ -68,7 +68,7 @@ def test_is_dir(self): """ # fmt: off, avoid remove tabs in string print("split lines:", s.splitlines()) - self.assertTrue(fs._test_match(s.splitlines()) != None) + self.assertIsNotNone(fs._test_match(s.splitlines())) def test_config(self): config = {"fs.default.name": "hdfs://xxx", "hadoop.job.ugi": "ugi"} diff --git a/python/paddle/fluid/tests/unittests/collective/init_process_group.py b/python/paddle/fluid/tests/unittests/collective/init_process_group.py index f45e4004831ef..2cdd029b90c8d 100644 --- a/python/paddle/fluid/tests/unittests/collective/init_process_group.py +++ b/python/paddle/fluid/tests/unittests/collective/init_process_group.py @@ -30,11 +30,11 @@ def test_init_process_group(self): paddle.distributed.init_parallel_env() paddle.distributed.new_group() group = paddle.distributed.new_group([-1, -2]) - assert group.process_group == None + assert group.process_group is None group = paddle.distributed.collective.Group(-1, 2, 0, [-1, -2]) ret = paddle.distributed.barrier(group) - assert ret == None +
assert ret is None paddle.enable_static() in_tensor = paddle.empty((1, 2)) in_tensor2 = paddle.empty((1, 2)) diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py index 417ff66e0cb7e..eece2c32f8c88 100644 --- a/python/paddle/fluid/tests/unittests/dist_text_classification.py +++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py @@ -167,7 +167,7 @@ def tokenize(pattern): # tarfile.extractfile, which does random access and might # destroy hard disks. tf = tarf.next() - while tf != None: + while tf is not None: if bool(pattern.match(tf.name)): # newline and punctuations removal and ad-hoc tokenization. yield tarf.extractfile(tf).read().rstrip(b'\n\r').translate( diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py index 5f62faeb4228f..2b19e974169d3 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py @@ -101,7 +101,7 @@ def check_results(self, no_pass_rets, pass_rets): zip(no_pass_ret, pass_ret) ): if out_var_no_pass is None: - self.assertTrue(out_var_pass is None) + self.assertIsNone(out_var_pass) else: np.testing.assert_allclose( out_var_no_pass, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index 266bcf4e7b786..3e5aae4d3110c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -246,7 +246,7 @@ def forward(self, inputs): enc_new_hidden, enc_new_cell = self.enc_units[i]( enc_step_input, enc_hidden[i], enc_cell[i] ) - if self.dropout != None and self.dropout > 0.0: + if self.dropout is not None and self.dropout > 0.0: enc_step_input = fluid.layers.dropout( enc_new_hidden, dropout_prob=self.dropout, @@ -278,7 +278,7 @@ def forward(self, inputs): ) new_dec_hidden.append(new_hidden) new_dec_cell.append(new_cell) - if self.dropout != None and self.dropout > 0.0: + if self.dropout is not None and self.dropout > 0.0: step_input = fluid.layers.dropout( new_hidden, dropout_prob=self.dropout, @@ -346,7 +346,7 @@ def beam_search(self, inputs): enc_new_hidden, enc_new_cell = self.enc_units[i]( enc_step_input, enc_hidden[i], enc_cell[i] ) - if self.dropout != None and self.dropout > 0.0: + if self.dropout is not None and self.dropout > 0.0: enc_step_input = fluid.layers.dropout( enc_new_hidden, dropout_prob=self.dropout, @@ -418,7 +418,7 @@ def beam_search(self, inputs): ) new_dec_hidden.append(new_hidden) new_dec_cell.append(new_cell) - if self.dropout != None and self.dropout > 0.0: + if self.dropout is not None and self.dropout > 0.0: step_input = fluid.layers.dropout( new_hidden, dropout_prob=self.dropout, @@ -760,7 +760,7 @@ def forward(self, inputs): enc_new_hidden, enc_new_cell = self.enc_units[i]( enc_step_input, enc_hidden[i], enc_cell[i] ) - if self.dropout != None and self.dropout > 0.0: + if self.dropout is not None and self.dropout > 0.0: enc_step_input = fluid.layers.dropout( enc_new_hidden, dropout_prob=self.dropout, @@ -803,7 +803,7 @@ def forward(self, inputs): ) new_dec_hidden.append(new_hidden) new_dec_cell.append(new_cell) - if self.dropout != None and self.dropout > 0.0: + if self.dropout is not 
None and self.dropout > 0.0: step_input = fluid.layers.dropout( new_hidden, dropout_prob=self.dropout, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_decorator_transform.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_decorator_transform.py index 2b73a1075e5a3..13fd569c920b4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_decorator_transform.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_decorator_transform.py @@ -73,7 +73,7 @@ def inner_deco(*args, **kwargs): return inner_deco - if func == None: + if func is None: return decorated return decorated(func) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py index 37bc3f5dc12c7..2f0672f7185d1 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_function_spec.py @@ -29,7 +29,7 @@ def test_constructor(self): args_name = foo_spec.args_name self.assertListEqual(args_name, ['a', 'b', 'c', 'd']) self.assertTrue(foo_spec.dygraph_function == foo_func) - self.assertTrue(foo_spec.input_spec is None) + self.assertIsNone(foo_spec.input_spec) def test_verify_input_spec(self): a_spec = InputSpec([None, 10], name='a') diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_place.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_place.py index 58e8e7b6728f5..bd41468560473 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_place.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_place.py @@ -21,7 +21,7 @@ def test_place(self): paddle.enable_static() x = paddle.to_tensor([1, 2, 3, 4]) - self.assertTrue(x.place() == None) + self.assertIsNone(x.place()) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py index 88c2024cff151..5a7ca3bc204ed 100644 --- a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py +++ b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py @@ -89,7 +89,7 @@ def check_network_convergence( first_loss, last_loss = None, None step_id = 0 custom_iter = getattr(self, "iter", None) - if not custom_iter == None: + if custom_iter is not None: iter = custom_iter for data in reader(): ret = exe.run(train_cp, feed=data, fetch_list=fetch_list) diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py index 9c12d5977d42b..ef1051c377a9e 100644 --- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py @@ -43,17 +43,17 @@ def adaptive_pool2d_forward( else [x.shape[3], x.shape[1], x.shape[2]] ) - if isinstance(output_size, int) or output_size == None: + if isinstance(output_size, int) or output_size is None: H_out = output_size W_out = output_size output_size = [H_out, W_out] else: H_out, W_out = output_size - if output_size[0] == None: + if output_size[0] is None: output_size[0] = H H_out = H - if output_size[1] == None: + if output_size[1] is None: output_size[1] = W W_out = W diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py index e7a8685a8f164..065a27c90e80d 100755 --- 
a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py @@ -39,7 +39,7 @@ def adaptive_pool3d_forward( else [x.shape[4], x.shape[1], x.shape[2], x.shape[3]] ) - if isinstance(output_size, int) or output_size == None: + if isinstance(output_size, int) or output_size is None: H_out = output_size W_out = output_size D_out = output_size @@ -47,13 +47,13 @@ def adaptive_pool3d_forward( else: D_out, H_out, W_out = output_size - if output_size[0] == None: + if output_size[0] is None: output_size[0] = D D_out = D - if output_size[1] == None: + if output_size[1] is None: output_size[1] = H H_out = H - if output_size[2] == None: + if output_size[2] is None: output_size[2] = W W_out = W diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py index 9884cb6c90da8..decbfbfa012ec 100644 --- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py @@ -41,17 +41,17 @@ def adaptive_pool2d_forward( else [x.shape[3], x.shape[1], x.shape[2]] ) - if isinstance(output_size, int) or output_size == None: + if isinstance(output_size, int) or output_size is None: H_out = output_size W_out = output_size output_size = [H_out, W_out] else: H_out, W_out = output_size - if output_size[0] == None: + if output_size[0] is None: output_size[0] = H H_out = H - if output_size[1] == None: + if output_size[1] is None: output_size[1] = W W_out = W diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py index e90c406198038..21400576c4752 100755 --- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py @@ -41,7 +41,7 @@ def adaptive_pool3d_forward( else [x.shape[4], x.shape[1], x.shape[2], x.shape[3]] ) - if isinstance(output_size, int) or output_size == None: + if isinstance(output_size, int) or output_size is None: H_out = output_size W_out = output_size D_out = output_size @@ -49,13 +49,13 @@ def adaptive_pool3d_forward( else: D_out, H_out, W_out = output_size - if output_size[0] == None: + if output_size[0] is None: output_size[0] = D D_out = D - if output_size[1] == None: + if output_size[1] is None: output_size[1] = H H_out = H - if output_size[2] == None: + if output_size[2] is None: output_size[2] = W W_out = W diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index e18a585d33f38..62ab2124a5e15 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -602,7 +602,7 @@ def test_mapper_misc(self): outputs={'Out': output}, ) self.assertEqual(get_comm_volume(broadcast_op, 0, 1), 400) - self.assertEqual(get_comm_volume(broadcast_op, 1, 0), None) + self.assertIsNone(get_comm_volume(broadcast_op, 1, 0)) allgather_op = train_program.global_block().append_op( type="c_allgather", inputs={'X': input}, @@ -610,14 +610,14 @@ def test_mapper_misc(self): outputs={'Out': output}, ) self.assertEqual(get_comm_volume(allgather_op, 0, 1), 400) - self.assertEqual(get_comm_volume(allgather_op, 0, 0), None) + self.assertIsNone(get_comm_volume(allgather_op, 0, 0)) reduce_op = train_program.global_block().append_op( type="c_reduce_sum", inputs={'X': 
input}, attrs={'ring_id': ring_id, 'root_id': root_id}, outputs={'Out': output}, ) - self.assertEqual(get_comm_volume(reduce_op, 0, 1), None) + self.assertIsNone(get_comm_volume(reduce_op, 0, 1)) self.assertEqual(get_comm_volume(reduce_op, 1, 0), 400) cast_op = train_program.global_block().append_op( type="cast", diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py index 852c6ab74b128..f235d136cd67d 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -53,26 +53,26 @@ def is_valid_completed_program(dist_context, program): vars_ = program.list_vars() for op in ops: op_dist_attrs = dist_context.get_op_dist_attr_for_program(op) - if op_dist_attrs == None: + if op_dist_attrs is None: return False - if op_dist_attrs.process_mesh == None: + if op_dist_attrs.process_mesh is None: return False for tensor_dist_attr in op_dist_attrs.inputs_dist_attrs.values(): - if None == tensor_dist_attr.dims_mapping: + if tensor_dist_attr.dims_mapping is None: return False for tensor_dist_attr in op_dist_attrs.outputs_dist_attrs.values(): - if None == tensor_dist_attr.dims_mapping: + if tensor_dist_attr.dims_mapping is None: return False for var in vars_: var_dist_attrs = dist_context.get_tensor_dist_attr_for_program(var) - if var_dist_attrs == None: + if var_dist_attrs is None: return False - elif var_dist_attrs.process_mesh == None: + elif var_dist_attrs.process_mesh is None: return False - elif var_dist_attrs.dims_mapping == None: + elif var_dist_attrs.dims_mapping is None: return False return True diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index 66cf1b489f6e6..fda58617373f4 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -606,7 +606,7 @@ def func_test_to_api_none_buffer(self): buffer = None model.register_buffer("buf_name", buffer, persistable=True) model.to(dtype='float64') - self.assertEqual(model._buffers['buf_name'], None) + self.assertIsNone(model._buffers['buf_name']) def test_main(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py index 107792a892de3..90c2e84d0eef6 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py @@ -106,9 +106,9 @@ def fake_reader(): break if break_beforehand: - self.assertTrue(next(gen, None) is not None) + self.assertIsNotNone(next(gen, None)) else: - self.assertTrue(next(gen, None) is None) + self.assertIsNone(next(gen, None)) class TestClass2(TestClass): diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_spectral_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_spectral_norm.py index b865a1b179933..b8968d3fdd01c 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_spectral_norm.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_spectral_norm.py @@ -67,7 +67,7 @@ def spectral_normalize(self, weight, u, v, dim, power_iters, eps): def test_check_output(self): linear = paddle.nn.Conv2D(2, 1, 3) before_weight = linear.weight.numpy().copy() - if self.dim == None: + if self.dim is None: if isinstance( 
linear, ( diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py index ebae140f88c25..1a3e4a2e7b510 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py @@ -122,7 +122,7 @@ def test_check_output(self): fluid.enable_imperative() linear = paddle.nn.Conv2D(2, 3, 3) before_weight = linear.weight.numpy() - if self.dim == None: + if self.dim is None: self.dim = -1 if self.dim != -1: diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py index eebc321b1a8f6..684322c529265 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py @@ -102,7 +102,7 @@ def assertScopeVar(self, scope, persitables, non_persistables): outline_p_vars = [] for name in persitables: var = scope.find_var(name) - self.assertTrue(var is not None) + self.assertIsNotNone(var) t = var.get_tensor() if not t._is_initialized(): outline_p_vars.append(name) @@ -110,7 +110,7 @@ def assertScopeVar(self, scope, persitables, non_persistables): outline_np_vars = [] for name in non_persistables: var = scope.find_var(name) - self.assertTrue(var is not None) + self.assertIsNotNone(var) t = var.get_tensor() if t._is_initialized(): outline_np_vars.append(name) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index e48a8056d030f..622e36abd80b6 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -204,7 +204,7 @@ def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): input = m - if dropout != None and dropout > 0.0: + if dropout is not None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, @@ -308,7 +308,7 @@ def encoder_static( cell_array[k] = c input = m - if dropout != None and dropout > 0.0: + if dropout is not None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, @@ -390,7 +390,7 @@ def encoder_static( x_emb = layers.reshape( x_emb, shape=[-1, num_steps, hidden_size], inplace=True ) - if dropout != None and dropout > 0.0: + if dropout is not None and dropout > 0.0: x_emb = layers.dropout( x_emb, dropout_prob=dropout, diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py index 49dd5d6928bfd..686c7fa1ef75a 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py @@ -111,7 +111,7 @@ def test_is_server(self): def test_util(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) - self.assertNotEqual(fleet.util, None) + self.assertIsNotNone(fleet.util) def test_barrier_worker(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py index ff12ef64b5f61..3dbe084960fd9 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_util.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py @@ -50,7 +50,7 @@ def test_util_factory(self): context["role_maker"] = role_maker 
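# A standalone sketch of the E711 rule these hunks apply: `==` dispatches to
# a class's `__eq__`, which may be overridden, while `is` always compares
# object identity, so `x is None` is the only reliable "unset" check.
class AlwaysEqual:
    def __eq__(self, other):
        return True  # claims equality with everything, including None

obj = AlwaysEqual()
assert (obj == None) is True   # noqa: E711 -- misleadingly "equal" to None
assert (obj is None) is False  # identity comparison gives the right answer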
context["valid_strategy"] = strategy util = factory._create_util(context) - self.assertEqual(util.role_maker, None) + self.assertIsNone(util.role_maker) def test_get_util(self): import paddle.distributed.fleet as fleet @@ -58,7 +58,7 @@ def test_get_util(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) - self.assertNotEqual(fleet.util, None) + self.assertIsNotNone(fleet.util) def test_set_user_defined_util(self): import paddle.distributed.fleet as fleet diff --git a/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py b/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py index a4d152c75b997..53ef3610d6fde 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py +++ b/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py @@ -112,7 +112,7 @@ def rand_test_base(self, m, n, k, trans_x, trans_y, need_bias, dtype, seed): if need_bias: np.testing.assert_array_equal(bias.grad.numpy(), bias_grad_np) else: - self.assertTrue(bias_grad_np is None) + self.assertIsNone(bias_grad_np) def rand_test(self, m, n, k, dtype): seed = int(np.random.randint(low=0, high=1000, size=[1])) diff --git a/python/paddle/fluid/tests/unittests/test_global_var_getter_setter.py b/python/paddle/fluid/tests/unittests/test_global_var_getter_setter.py index 3394a08de8b19..9e32b43ad42f8 100644 --- a/python/paddle/fluid/tests/unittests/test_global_var_getter_setter.py +++ b/python/paddle/fluid/tests/unittests/test_global_var_getter_setter.py @@ -36,7 +36,7 @@ def test_main(self): self.assertTrue(var.name in g.keys()) value1 = g[var.name] value2 = g.get(var.name, None) - self.assertTrue(value1 is not None) + self.assertIsNotNone(value1) self.assertEqual(value1, value2) self.assertEqual(type(value1), var.type) self.assertEqual(type(value2), var.type) @@ -53,7 +53,7 @@ def test_main(self): name = "__any_non_exist_name__" self.assertFalse(name in g) self.assertFalse(name in g.keys()) - self.assertTrue(g.get(name, None) is None) + self.assertIsNone(g.get(name, None)) self.assertEquals(g.get(name, -1), -1) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index 3cda7d5d216fd..ba533bf720a6d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -162,8 +162,8 @@ def func_auto_prune(self): v2 = fluid.dygraph.to_variable(value2) loss = case1(v1, v2) loss.backward() - self.assertTrue(case1.linear2.weight._grad_ivar() is not None) - self.assertTrue(case1.linear1.weight._grad_ivar() is not None) + self.assertIsNotNone(case1.linear2.weight._grad_ivar()) + self.assertIsNotNone(case1.linear1.weight._grad_ivar()) def test_auto_prune(self): with _test_eager_guard(): @@ -180,8 +180,8 @@ def func_auto_prune2(self): loss = case2(v1, v2) loss.backward() - self.assertTrue(case2.linear2.weight._grad_ivar() is None) - self.assertTrue(case2.linear1.weight._grad_ivar() is not None) + self.assertIsNone(case2.linear2.weight._grad_ivar()) + self.assertIsNotNone(case2.linear1.weight._grad_ivar()) def test_auto_prune2(self): with _test_eager_guard(): @@ -198,7 +198,7 @@ def func_auto_prune3(self): v2 = fluid.dygraph.to_variable(value2) loss, part2 = case3(v1, v2, 1) loss.backward() - self.assertTrue(case3.linear.weight._grad_ivar() is not None) + self.assertIsNotNone(case3.linear.weight._grad_ivar()) self.assertTrue((part2.gradient() == 0).all()) def test_auto_prune3(self): @@ -217,7 +217,7 
@@ def func_auto_prune4(self): v2 = fluid.dygraph.to_variable(value2) loss, part2 = case4(v1, v2, 1) part2.backward() - self.assertTrue(case4.linear.weight._grad_ivar() is not None) + self.assertIsNotNone(case4.linear.weight._grad_ivar()) self.assertTrue((part2.gradient() == 1).all()) def test_auto_prune4(self): @@ -236,7 +236,7 @@ def func_auto_prune5(self): v2 = fluid.dygraph.to_variable(value2) loss, part1, part2 = case4(v1, v2, 2) part1.backward() - self.assertTrue(case4.linear.weight._grad_ivar() is not None) + self.assertIsNotNone(case4.linear.weight._grad_ivar()) self.assertTrue((part2.gradient() == 0).all()) def test_auto_prune5(self): @@ -261,8 +261,8 @@ def func_auto_prune6(self): out1.stop_gradient = True out = fluid.layers.concat(input=[out1, out2, c], axis=1) out.backward() - self.assertTrue(linear.weight.gradient() is None) - self.assertTrue(out1.gradient() is None) + self.assertIsNone(linear.weight.gradient()) + self.assertIsNone(out1.gradient()) def test_auto_prune6(self): with _test_eager_guard(): @@ -284,8 +284,8 @@ def func_auto_prune7(self): out1.stop_gradient = True out = fluid.layers.concat(input=[out1, out2, c], axis=1) out.backward() - self.assertTrue(linear.weight.gradient() is None) - self.assertTrue(out1.gradient() is None) + self.assertIsNone(linear.weight.gradient()) + self.assertIsNone(out1.gradient()) def test_auto_prune7(self): with _test_eager_guard(): @@ -377,8 +377,8 @@ def func_auto_prune10(self): # TODO(jiabin): In Eager Mode we don't actually need sort_sum_gradient, this test should be removed when we don't support fluid anymore. fluid.set_flags({'FLAGS_sort_sum_gradient': True}) out.backward() - self.assertTrue(linear.weight.gradient() is None) - self.assertTrue(out1.gradient() is None) + self.assertIsNone(linear.weight.gradient()) + self.assertIsNone(out1.gradient()) def test_auto_prune10(self): with _test_eager_guard(): @@ -449,8 +449,8 @@ def func_case2_prune_no_grad_branch(self): case3 = AutoPruneLayer2(input_size=784) loss = case3(v1, v2) loss.backward() - self.assertTrue(case3.linear2.weight._grad_ivar() is None) - self.assertTrue(case3.linear.weight._grad_ivar() is not None) + self.assertIsNone(case3.linear2.weight._grad_ivar()) + self.assertIsNotNone(case3.linear.weight._grad_ivar()) def test_case2_prune_no_grad_branch(self): with _test_eager_guard(): @@ -468,7 +468,7 @@ def func_case3_prune_no_grad_branch2(self): out = fluid.layers.one_hot(input=label, depth=100) loss = paddle.mean(out) loss.backward() - self.assertTrue(linear.weight._grad_ivar() is None) + self.assertIsNone(linear.weight._grad_ivar()) def test_case3_prune_no_grad_branch2(self): with _test_eager_guard(): @@ -480,7 +480,7 @@ def func_case4_with_no_grad_op_maker(self): out = fluid.layers.gaussian_random(shape=[20, 30]) loss = paddle.mean(out) loss.backward() - self.assertTrue(out._grad_ivar() is None) + self.assertIsNone(out._grad_ivar()) def test_case4_with_no_grad_op_maker(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 62afffb59abca..72f0a599385a5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -272,7 +272,7 @@ def test_no_grad_guard(self): data = np.array([[2, 3], [4, 5]]).astype('float32') with fluid.dygraph.guard(): l0 = fluid.Linear(2, 2) - self.assertTrue(l0.weight._grad_ivar() is None) + self.assertIsNone(l0.weight._grad_ivar()) l1 = fluid.Linear(2, 
2) with fluid.dygraph.no_grad(): self.assertTrue(l1.weight.stop_gradient is False) @@ -283,14 +283,14 @@ def test_no_grad_guard(self): o = l1(y) o.backward() - self.assertTrue(tmp._grad_ivar() is None) - self.assertTrue(l0.weight._grad_ivar() is not None) + self.assertIsNone(tmp._grad_ivar()) + self.assertIsNotNone(l0.weight._grad_ivar()) def test_paddle_imperative_no_grad_guard(self): data = np.array([[2, 3], [4, 5]]).astype('float32') with fluid.dygraph.guard(): l0 = fluid.Linear(2, 2) - self.assertTrue(l0.weight._grad_ivar() is None) + self.assertIsNone(l0.weight._grad_ivar()) l1 = fluid.Linear(2, 2) with paddle.no_grad(): self.assertTrue(l1.weight.stop_gradient is False) @@ -301,14 +301,14 @@ def test_paddle_imperative_no_grad_guard(self): o = l1(y) o.backward() - self.assertTrue(tmp._grad_ivar() is None) - self.assertTrue(l0.weight._grad_ivar() is not None) + self.assertIsNone(tmp._grad_ivar()) + self.assertIsNotNone(l0.weight._grad_ivar()) def test_paddle_imperative_set_grad_enabled(self): data = np.array([[2, 3], [4, 5]]).astype('float32') with fluid.dygraph.guard(): l0 = fluid.Linear(2, 2) - self.assertTrue(l0.weight._grad_ivar() is None) + self.assertIsNone(l0.weight._grad_ivar()) l1 = fluid.Linear(2, 2) with paddle.set_grad_enabled(False): self.assertTrue(l1.weight.stop_gradient is False) @@ -322,9 +322,9 @@ def test_paddle_imperative_set_grad_enabled(self): o = l1(y) o.backward() - self.assertTrue(tmp._grad_ivar() is None) - self.assertTrue(tmp2._grad_ivar() is not None) - self.assertTrue(l0.weight._grad_ivar() is not None) + self.assertIsNone(tmp._grad_ivar()) + self.assertIsNotNone(tmp2._grad_ivar()) + self.assertIsNotNone(l0.weight._grad_ivar()) def test_paddle_imperative_is_grad_enabled(self): with fluid.dygraph.guard(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index 140f1191c9644..f121bacb2a585 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -83,7 +83,7 @@ def func_simple_example_eager_grad_allow_unused(self): # stop_gradient = !create_graph, create_graph default false self.assertEqual(dx[0].stop_gradient, True) # x is unused input in the graph - self.assertEqual(dx[1], None) + self.assertIsNone(dx[1]) def test_simple_example_eager_grad_allow_unused(self): with _test_eager_guard(): @@ -292,7 +292,7 @@ def func_simple_example(self): (none_grad,) = self.grad( [x], [y], create_graph=create_graph, allow_unused=True ) - self.assertTrue(none_grad is None) + self.assertIsNone(none_grad) (grad_with_none_and_not_none,) = self.grad( [x, y], [y], create_graph=create_graph diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 0219cc9947975..c1e6c3b3b3a93 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -84,7 +84,7 @@ def _reader_imple(): def _check_exception(self, exception_message, place=None): seed = 90 batch_size = 128 - if place == None: + if place is None: place = ( fluid.CUDAPlace(0) if core.is_compiled_with_cuda() @@ -106,7 +106,7 @@ def _check_mlp(self, place=None): seed = 90 batch_size = 128 - if place == None: + if place is None: place = ( fluid.CPUPlace() if not core.is_compiled_with_cuda() @@ -161,7 +161,7 @@ def _check_mlp(self, place=None): paddle.seed(seed) 
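# A minimal sketch of the None-sentinel pattern used by `_check_mlp` above:
# a default of None marks "caller did not choose", and an identity test
# (rather than `== None`) selects the fallback. Names here are hypothetical.
def run_test(place=None):
    if place is None:   # identity check for the unset sentinel
        place = 'cpu'   # fall back to a default device
    return place

assert run_test() == 'cpu'
assert run_test('gpu:0') == 'gpu:0'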
paddle.framework.random._manual_program_seed(seed) - if place == None: + if place is None: place = ( fluid.CPUPlace() if not core.is_compiled_with_cuda() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py index 251ac5b6b9338..c531374478bda 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -81,7 +81,7 @@ def _reader_imple(): def _check_exception(self, exception_message, place=None): seed = 90 batch_size = 128 - if place == None: + if place is None: place = ( fluid.CUDAPlace(0) if core.is_compiled_with_cuda() @@ -105,7 +105,7 @@ def _check_mlp(self, place=None): seed = 90 batch_size = 128 - if place == None: + if place is None: place = ( fluid.CPUPlace() if not core.is_compiled_with_cuda() @@ -170,7 +170,7 @@ def _check_mlp(self, place=None): paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) - if place == None: + if place is None: place = ( fluid.CPUPlace() if not core.is_compiled_with_cuda() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 55cc00b12ab91..10c69cbc43d13 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -1003,7 +1003,7 @@ def func_testOnlyLoadParams(self): os.path.join('saved_dy', 'emb_dy') ) - self.assertTrue(opti_state_dict == None) + self.assertIsNone(opti_state_dict) para_state_dict, opti_state_dict = fluid.load_dygraph( os.path.join('saved_dy', 'emb_dy.pdparams') @@ -1022,8 +1022,8 @@ def func_test_load_compatible_with_keep_name_table(self): para_state_dict, opti_state_dict = fluid.load_dygraph( os.path.join('saved_dy', 'emb_dy'), keep_name_table=True ) - self.assertTrue(para_state_dict != None) - self.assertTrue(opti_state_dict == None) + self.assertIsNotNone(para_state_dict) + self.assertIsNone(opti_state_dict) def test_main(self): self.func_setUp() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py index 93a04a36115c6..3b5cab4ce977a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py @@ -62,18 +62,18 @@ def func_selectedrows_gradient1(self): ) # grad_clip=grad_clip input_emb, emb = simplenet(input) - self.assertTrue(emb.weight.gradient() is None) - self.assertTrue(input_emb.gradient() is None) + self.assertIsNone(emb.weight.gradient()) + self.assertIsNone(input_emb.gradient()) input_emb.backward() adam.minimize(input_emb) - self.assertTrue(emb.weight.gradient() is not None) + self.assertIsNotNone(emb.weight.gradient()) emb.clear_gradients() - self.assertTrue(emb.weight.gradient() is None) + self.assertIsNone(emb.weight.gradient()) input_emb.clear_gradient() - self.assertTrue(input_emb.gradient() is not None) + self.assertIsNotNone(input_emb.gradient()) paddle.enable_static() def test_selectedrows_gradient1(self): @@ -107,18 +107,18 @@ def func_selectedrows_gradient2(self): ) input_emb, emb = simplenet(input) - self.assertTrue(emb.weight.gradient() is None) - self.assertTrue(input_emb.gradient() is None) + self.assertIsNone(emb.weight.gradient()) + self.assertIsNone(input_emb.gradient()) input_emb.backward() adam.minimize(input_emb) - 
self.assertTrue(emb.weight.gradient() is not None) + self.assertIsNotNone(emb.weight.gradient()) emb.clear_gradients() - self.assertTrue(emb.weight.gradient() is None) + self.assertIsNone(emb.weight.gradient()) input_emb.clear_gradient() - self.assertTrue(input_emb.gradient() is not None) + self.assertIsNotNone(input_emb.gradient()) def test_selectedrows_gradient2(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py index bf9ac53f99c53..4e3288fd03786 100644 --- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py @@ -449,7 +449,7 @@ def test_serialize_program_and_persistables(self): self.assertTrue(isinstance(res2, bytes)) # test if variables in program is empty res = paddle.static.io._serialize_persistables(Program(), None) - self.assertEqual(res, None) + self.assertIsNone(res) self.assertRaises( TypeError, paddle.static.io.deserialize_persistables, diff --git a/python/paddle/fluid/tests/unittests/test_input_spec.py b/python/paddle/fluid/tests/unittests/test_input_spec.py index 9d805d64e93cc..5f3670aa50f90 100644 --- a/python/paddle/fluid/tests/unittests/test_input_spec.py +++ b/python/paddle/fluid/tests/unittests/test_input_spec.py @@ -32,7 +32,7 @@ def test_default(self): self.assertEqual( tensor_spec.dtype, convert_np_dtype_to_dtype_('float32') ) - self.assertEqual(tensor_spec.name, None) + self.assertIsNone(tensor_spec.name) def test_from_tensor(self): x_bool = fluid.layers.fill_constant(shape=[1], dtype='bool', value=True) @@ -51,7 +51,7 @@ def test_from_numpy(self): x_np_spec.dtype, convert_np_dtype_to_dtype_(x_numpy.dtype) ) self.assertEqual(x_np_spec.shape, x_numpy.shape) - self.assertEqual(x_np_spec.name, None) + self.assertIsNone(x_np_spec.name) x_numpy2 = np.array([1, 2, 3, 4]).astype('int64') x_np_spec2 = InputSpec.from_numpy(x_numpy2, name='x_np_int64') diff --git a/python/paddle/fluid/tests/unittests/test_lambv2_op.py b/python/paddle/fluid/tests/unittests/test_lambv2_op.py index 475866fb903e8..bb2aee8873984 100644 --- a/python/paddle/fluid/tests/unittests/test_lambv2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lambv2_op.py @@ -241,8 +241,8 @@ def get_parameter(var): ) return params[0].astype(np.float32) else: - self.assertTrue(params[0] is not None) - self.assertTrue(params[1] is None) + self.assertIsNotNone(params[0]) + self.assertIsNone(params[1]) params[0] = np.array(params[0]) return params[0] diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 14754e347f9b9..93c630888befb 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -502,7 +502,7 @@ def test_conv2d(self): bias_attr=False, ) dy_ret = conv2d(base.to_variable(images)) - self.assertTrue(conv2d.bias is None) + self.assertIsNone(conv2d.bias) images = np.ones([2, 3, 5, 5], dtype='float32') conv2d = nn.Conv2D( @@ -512,7 +512,7 @@ def test_conv2d(self): bias_attr=False, ) dy_ret = conv2d(base.to_variable(images)) - self.assertTrue(conv2d.bias is None) + self.assertIsNone(conv2d.bias) with self.static_graph(): # the input of Conv2D must be Variable. 
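# A self-contained sketch (plain unittest, no Paddle dependency) of the
# assertion rewrite applied throughout these tests: assertIsNone and
# assertIsNotNone report the offending value on failure, while
# assertTrue(x is None) only reports "False is not true".
import unittest

class NoneAssertExample(unittest.TestCase):
    def test_none_checks(self):
        bias = None
        self.assertIsNone(bias)        # preferred: precise failure message
        self.assertTrue(bias is None)  # legacy form being replaced

if __name__ == '__main__':
    unittest.main()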
@@ -4359,8 +4359,8 @@ def test_linear_chain_crf(self): crf_decode = layers.crf_decoding( input=emission, param_attr=ParamAttr(name="crfw") ) - self.assertFalse(crf is None) - self.assertFalse(crf_decode is None) + self.assertIsNotNone(crf) + self.assertIsNotNone(crf_decode) return layers.chunk_eval( input=crf_decode, label=label, @@ -4386,8 +4386,8 @@ def test_linear_chain_crf_padding(self): crf_decode = layers.crf_decoding( input=emission, length=length, param_attr=ParamAttr(name="crfw") ) - self.assertFalse(crf is None) - self.assertFalse(crf_decode is None) + self.assertIsNotNone(crf) + self.assertIsNotNone(crf_decode) return layers.chunk_eval( input=crf_decode, label=label, diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py index dd958243fade6..e6e0e50ac76d7 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py @@ -128,7 +128,7 @@ def func_simple_example(self): (none_grad,) = self.grad( [x], [y], create_graph=create_graph, allow_unused=True ) - self.assertTrue(none_grad is None) + self.assertIsNone(none_grad) (grad_with_none_and_not_none,) = self.grad( [x, y], [y], create_graph=create_graph diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index b7dd11f7b8a80..e888d3c09c895 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -205,8 +205,8 @@ def test_all_profiler(self): class TestProfilerAPIError(unittest.TestCase): def test_errors(self): options = utils.ProfilerOptions() - self.assertTrue(options['profile_path'] is None) - self.assertTrue(options['timeline_path'] is None) + self.assertIsNone(options['profile_path']) + self.assertIsNone(options['timeline_path']) options = options.with_state('All') self.assertTrue(options['state'] == 'All') diff --git a/python/paddle/fluid/tests/unittests/test_pylayer_op.py b/python/paddle/fluid/tests/unittests/test_pylayer_op.py index becab9796dff3..eaa000e35823a 100644 --- a/python/paddle/fluid/tests/unittests/test_pylayer_op.py +++ b/python/paddle/fluid/tests/unittests/test_pylayer_op.py @@ -187,12 +187,12 @@ def backward(ctx, dy1): for dtype in dtypes: input1 = paddle.randn([2, 3]) input1.stop_gradient = False - self.assertTrue(input1.grad is None) + self.assertIsNone(input1.grad) z = tanh.apply(input1, dtype) z = paddle.cast(z, "float32") z.sum().backward() - self.assertTrue(input1.grad is not None) + self.assertIsNotNone(input1.grad) def test_pylayer_dtype(self): with _test_eager_guard(): @@ -283,7 +283,7 @@ def backward(ctx, x1, y1, dy1): input1 = paddle.randn([2, 3]).astype("float64") z = tanh.apply(input1, paddle.tanh, paddle.square) z.mean().backward() - self.assertTrue(z.grad is None) + self.assertIsNone(z.grad) def test_pylayer_nograd(self): with _test_eager_guard(): @@ -472,7 +472,7 @@ def forward(self, data): layer = Layer() z = layer(data) z.backward() - self.assertTrue(data.grad is not None) + self.assertIsNotNone(data.grad) def test_pylayer_inplace(self): with _test_eager_guard(): @@ -547,7 +547,7 @@ def forward(self, data): layer = Layer() z = layer(data) z.backward() - self.assertTrue(data.grad is not None) + self.assertIsNotNone(data.grad) def test_pylayer_inplace_backward_success_2(self): with _test_eager_guard(): @@ -580,7 +580,7 @@ def forward(self, data): layer = 
Layer() z = layer(data) z.backward() - self.assertTrue(data.grad is not None) + self.assertIsNotNone(data.grad) def func_test_pylayer_inplace_and_leaf_exception(self): class cus_pylayer_op( @@ -630,7 +630,7 @@ def backward(ctx, dy): temp.stop_gradient = False z = paddle.tanh(temp) z.backward() - self.assertTrue(temp.grad is not None) + self.assertIsNotNone(temp.grad) return paddle.to_tensor(temp.grad) for i in range(2): diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index a3f1697032b55..35e7f47fefbac 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -37,7 +37,7 @@ def test_l2decay_regularizer(self): name="mul.x", regularizer=regularizer.L2DecayRegularizer(0.5), ) - self.assertTrue(mul_x.regularizer is not None) + self.assertIsNotNone(mul_x.regularizer) self.assertTrue( isinstance(mul_x.regularizer, regularizer.L2DecayRegularizer) ) @@ -82,7 +82,7 @@ def test_l2decay_regularizer(self): name="mul.x", regularizer=regularizer.L1DecayRegularizer(0.5), ) - self.assertTrue(mul_x.regularizer is not None) + self.assertIsNotNone(mul_x.regularizer) self.assertTrue( isinstance(mul_x.regularizer, regularizer.L1DecayRegularizer) ) diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index 8e7de65896967..bfe7f16cea14f 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -494,7 +494,7 @@ def double_print_hook(grad): )[0] z = y + dx - self.assertTrue(x.grad is None) + self.assertIsNone(x.grad) # If create_graph = True, the gradient of dx # would be backpropagated. Therefore, diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 38e65744a8110..6adf1c7418013 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -125,7 +125,7 @@ def check_with_place(place): ) np.testing.assert_array_equal(x.numpy(), [1.0, 2.0]) self.assertEqual(x.dtype, core.VarDesc.VarType.FP32) - self.assertEqual(x.grad, None) + self.assertIsNone(x.grad) self.assertEqual(x.shape, [2]) self.assertEqual(x.stop_gradient, False) self.assertEqual(x.type, core.VarDesc.VarType.LOD_TENSOR) @@ -447,7 +447,7 @@ def func_test_detach(self): y = x**2 y.backward() self.assertTrue(cmp_float(x.grad.numpy(), [20.0])) - self.assertEqual(detach_x.grad, None) + self.assertIsNone(detach_x.grad) detach_x.stop_gradient = ( False # Set stop_gradient to be False, supported auto-grad @@ -1844,10 +1844,10 @@ def test_eager_tensor_grad_name_value(self): a = paddle.to_tensor(a_np) a.stop_gradient = False b = a**2 - self.assertEqual(a._grad_value(), None) + self.assertIsNone(a._grad_value()) b.backward() # Note, for new dygraph, there are no generated grad name, so we skip the name check. 
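# A condensed sketch of the gradient life cycle the surrounding assertions
# exercise (assumes a Paddle 2.x dynamic-graph build; values are arbitrary):
import numpy as np
import paddle

a = paddle.to_tensor(np.array([2.0], dtype='float32'))
a.stop_gradient = False
assert a.grad is None       # no gradient before backward()
b = (a ** 2).sum()
b.backward()
assert a.grad is not None   # gradient populated after backward()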
- self.assertNotEqual(a._grad_value(), None) + self.assertIsNotNone(a._grad_value()) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_var_info.py b/python/paddle/fluid/tests/unittests/test_var_info.py index 4bb6648488ee6..d03fb2387bd0d 100644 --- a/python/paddle/fluid/tests/unittests/test_var_info.py +++ b/python/paddle/fluid/tests/unittests/test_var_info.py @@ -32,7 +32,7 @@ def test_var_info(self): ret = var._get_info("name") assert ret == "test" ret = var._get_info("not_exist") - assert ret == None + assert ret is None if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index a13b73f501bd6..f40e111a6e3a7 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -41,7 +41,7 @@ def __create_var__(name, var_name): __create_var__(in_name, sub_in_name) else: __create_var__(in_name, in_name) - if cache_list != None and isinstance(cache_list, list): + if cache_list is not None and isinstance(cache_list, list): for name in cache_list: kwargs[name] = [] scope.var(name) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py index 21e46e31783a4..b439ffb5d20cb 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py @@ -72,7 +72,7 @@ def generate_compatible_shapes( dim_X, dim_Y, transpose_X, transpose_Y, batch_size ): BATCH_SIZE = 2 - if batch_size != None: + if batch_size is not None: BATCH_SIZE = batch_size M = 3 diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py index f8151297c2d10..eaa60c7e3a95d 100644 --- a/python/paddle/fluid/trainer_desc.py +++ b/python/paddle/fluid/trainer_desc.py @@ -354,7 +354,7 @@ def _set_program(self, program): def _gen_trainer_desc(self): super(DistMultiTrainer, self)._gen_trainer_desc() self.proto_desc.class_name = "DistMultiTrainer" - if self._program == None: + if self._program is None: raise RuntimeError("None Program") self._device_worker._set_infer(self._infer) self._device_worker._set_program(self._program) @@ -378,7 +378,7 @@ def _set_program(self, program): def _gen_trainer_desc(self): super(HeterXpuTrainer, self)._gen_trainer_desc() self.proto_desc.class_name = "HeterXpuTrainer" - if self._program == None: + if self._program is None: raise RuntimeError("None Program") self._device_worker._set_infer(self._infer) self._device_worker._set_program(self._program) @@ -402,7 +402,7 @@ def _set_program(self, program): def _gen_trainer_desc(self): super(PSGPUTrainer, self)._gen_trainer_desc() self.proto_desc.class_name = "PSGPUTrainer" - if self._program == None: + if self._program is None: raise RuntimeError("None Program") self._device_worker._set_infer(self._infer) self._device_worker._set_program(self._program) @@ -426,7 +426,7 @@ def _set_program(self, program): def _gen_trainer_desc(self): super(HeterPipelineTrainer, self)._gen_trainer_desc() self.proto_desc.class_name = "HeterPipelineTrainer" - if self._program == None: + if self._program is None: raise RuntimeError("None Program") self._device_worker._set_infer(self._infer) self._device_worker._set_program(self._program) @@ -450,7 +450,7 @@ def _set_program(self, program): def _gen_trainer_desc(self): super(PipelineTrainer, self)._gen_trainer_desc() self.proto_desc.class_name = "PipelineTrainer" - if self._program == None: + if 
self._program is None: raise RuntimeError("None Program") self._device_worker._set_infer(self._infer) self._device_worker._set_program(self._program) diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py index 7ac367b38fd2c..761895fe3044f 100644 --- a/python/paddle/fluid/trainer_factory.py +++ b/python/paddle/fluid/trainer_factory.py @@ -192,7 +192,7 @@ def handler_launch_func(self, scope, handler): for key in var_name_to_key: var = scope.find_var(key) fetch_dict[key] = var - if var == None: + if var is None: local_logger.warning( "{} value currently not available".format( var_name_to_key[key] @@ -201,7 +201,7 @@ def handler_launch_func(self, scope, handler): res_dict = {} for key in fetch_dict: user_name = var_name_to_key[key] - if fetch_dict[key] == None: + if fetch_dict[key] is None: res_dict[user_name] = None continue else: diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 28dbf22153f77..6bdddddd93fe3 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -692,7 +692,7 @@ def transpile( ps_dispatcher = self.config.split_method(self.pserver_endpoints) self.table_name = find_distributed_lookup_table(self.origin_program) - self.has_distributed_lookup_table = self.table_name != None + self.has_distributed_lookup_table = self.table_name is not None self.param_name_to_grad_name = dict() self.grad_name_to_param_name = dict() for param_var, grad_var in self.params_grads: diff --git a/python/paddle/fluid/transpiler/geo_sgd_transpiler.py b/python/paddle/fluid/transpiler/geo_sgd_transpiler.py index 0163d591cb434..80b39dceaacc5 100644 --- a/python/paddle/fluid/transpiler/geo_sgd_transpiler.py +++ b/python/paddle/fluid/transpiler/geo_sgd_transpiler.py @@ -104,7 +104,7 @@ def transpile( # distribute lookup table self.table_name = find_distributed_lookup_table(self.origin_program) - self.has_distributed_lookup_table = self.table_name != None + self.has_distributed_lookup_table = self.table_name is not None self.origin_program._distributed_lookup_table = ( self.table_name if self.table_name else None ) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index a61d05761303d..10b980aaabf07 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1115,7 +1115,7 @@ def dropout( if axis and not isinstance(axis, (int, list, tuple)): raise TypeError("datatype of axis argument should be int or list") - if axis == None: # commonly used dropout + if axis is None: # commonly used dropout seed = None mode = ( 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 87d61e9180913..25008f7e2dc42 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -193,7 +193,7 @@ def batch_norm( data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC' - if use_global_stats == None: + if use_global_stats is None: use_global_stats = not training trainable_statistics = False else: diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index d81987fa9eedb..daa5c8d841a41 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -2174,9 +2174,9 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): output_size = 
utils.convert_to_list(output_size, 2, 'output_size') else: output_size = list(output_size) - if output_size[0] == None: + if output_size[0] is None: output_size[0] = in_h - if output_size[1] == None: + if output_size[1] is None: output_size[1] = in_w if in_dygraph_mode(): pool_out = _C_ops.max_pool2d_with_index( @@ -2269,11 +2269,11 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): output_size = utils.convert_to_list(output_size, 3, 'output_size') else: output_size = list(output_size) - if output_size[0] == None: + if output_size[0] is None: output_size[0] = in_l - if output_size[1] == None: + if output_size[1] is None: output_size[1] = in_h - if output_size[2] == None: + if output_size[2] is None: output_size[2] = in_w if in_dynamic_mode(): diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py index ef09a1cd5e2b5..a4f1b2b789d3e 100644 --- a/python/paddle/nn/layer/distance.py +++ b/python/paddle/nn/layer/distance.py @@ -78,6 +78,6 @@ def extra_repr(self): main_str += ', epsilon={epsilon}' if self.keepdim is not False: main_str += ', keepdim={keepdim}' - if self.name != None: + if self.name is not None: main_str += ', name={name}' return main_str.format(**self.__dict__) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 5f4a4d8d1d8c7..8864237a2820a 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -394,7 +394,7 @@ def __init__( default_initializer=Constant(1.0), ) self.weight.stop_gradient = ( - self._weight_attr != None + self._weight_attr is not None and self._weight_attr.learning_rate == 0.0 ) @@ -411,7 +411,8 @@ def __init__( attr=self._bias_attr, shape=param_shape, is_bias=True ) self.bias.stop_gradient = ( - self._bias_attr != None and self._bias_attr.learning_rate == 0.0 + self._bias_attr is not None + and self._bias_attr.learning_rate == 0.0 ) def forward(self, input): @@ -635,7 +636,7 @@ def __init__( default_initializer=Constant(1.0), ) self.weight.stop_gradient = ( - self._weight_attr != None + self._weight_attr is not None and self._weight_attr.learning_rate == 0.0 ) @@ -656,7 +657,8 @@ def __init__( is_bias=True, ) self.bias.stop_gradient = ( - self._bias_attr != None and self._bias_attr.learning_rate == 0.0 + self._bias_attr is not None + and self._bias_attr.learning_rate == 0.0 ) moving_mean_name = None @@ -1293,15 +1295,15 @@ def convert_sync_batchnorm(cls, layer): layer_output = layer if isinstance(layer, _BatchNormBase): if ( - layer._weight_attr != None + layer._weight_attr is not None and not isinstance(layer._weight_attr, bool) - and layer._weight_attr.name != None + and layer._weight_attr.name is not None ): layer._weight_attr.name = layer._weight_attr.name + '_sync' if ( - layer._bias_attr != None + layer._bias_attr is not None and not isinstance(layer._bias_attr, bool) - and layer._bias_attr.name != None + and layer._bias_attr.name is not None ): layer._bias_attr.name = layer._bias_attr.name + '_sync' diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index a7b672383a9ec..3e64c409559d2 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -704,7 +704,7 @@ def _add_accumulator( name, param.name ) ) - if shape == None: + if shape is None: shape = param.shape assert isinstance(self.helper, LayerHelper) diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index bcd87d9c826fc..97c3f73beae8a 100644 --- a/python/paddle/profiler/profiler.py +++ 
b/python/paddle/profiler/profiler.py @@ -522,7 +522,7 @@ def __init__( else: self.scheduler = _default_state_scheduler - if on_trace_ready == None: + if on_trace_ready is None: self.on_trace_ready = export_chrome_tracing('./profiler_log/') else: self.on_trace_ready = on_trace_ready diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index 4d7b36554b590..d26e6de82f2f8 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -189,6 +189,6 @@ def warpper(*args, **kwargs): for classname in optimizer.__all__: if classname != 'Optimizer': classobject = getattr(optimizer, classname) - if getattr(classobject, 'step', None) != None: + if getattr(classobject, 'step', None) is not None: classobject.step = optimizer_warpper(classobject.step) _has_optimizer_wrapped = True diff --git a/python/paddle/sparse/nn/layer/norm.py b/python/paddle/sparse/nn/layer/norm.py index 117fbf01a1d6a..b3993a2947b00 100644 --- a/python/paddle/sparse/nn/layer/norm.py +++ b/python/paddle/sparse/nn/layer/norm.py @@ -129,7 +129,7 @@ def forward(self, input): "When training, we now always track global mean and variance." ) - if self._use_global_stats == None: + if self._use_global_stats is None: self._use_global_stats = not self.training trainable_statistics = False else: @@ -363,15 +363,15 @@ def convert_sync_batchnorm(cls, layer): layer_output = layer if isinstance(layer, _BatchNormBase): if ( - layer._weight_attr != None + layer._weight_attr is not None and not isinstance(layer._weight_attr, bool) - and layer._weight_attr.name != None + and layer._weight_attr.name is not None ): layer._weight_attr.name = layer._weight_attr.name + '_sync' if ( - layer._bias_attr != None + layer._bias_attr is not None and not isinstance(layer._bias_attr, bool) - and layer._bias_attr.name != None + and layer._bias_attr.name is not None ): layer._bias_attr.name = layer._bias_attr.name + '_sync' diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index ac598328352a9..47e3dddbbd893 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -688,7 +688,7 @@ def deserialize_persistables(program, data, executor): if not isinstance(var, Parameter): continue var_tmp = paddle.fluid.global_scope().find_var(var.name) - assert var_tmp != None, "can't not find var: " + var.name + assert var_tmp is not None, "can't not find var: " + var.name new_shape = (np.array(var_tmp.get_tensor())).shape assert var.name in origin_shape_map, var.name + " MUST in var list." origin_shape = origin_shape_map.get(var.name) diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 19a63d515bea7..edf316bbf508a 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -218,7 +218,7 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): else: count[-1] += 1 - if rhs != None: + if rhs is not None: validate_rhs(rhs, labels, n_bcast_dims) g_labels_out = rhs.replace('...', '.' 
* n_bcast_dims) else: diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 5348681ad04a7..257fad6cafa1d 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -466,9 +466,9 @@ def inf_norm( if in_dygraph_mode(): out = _C_ops.abs(input) reduce_all = ( - True if axis == None or axis == [] or asvector else False + True if axis is None or axis == [] or asvector else False ) - axis = axis if axis != None and axis != [] else [0] + axis = axis if axis is not None and axis != [] else [0] if reduce_all: assert (axis == []) or (axis is None) if porder == np.float64('inf'): @@ -485,8 +485,8 @@ def inf_norm( dtype=helper.input_dtype() ) - reduce_all = True if axis == None or axis == [] or asvector else False - axis = axis if axis != None and axis != [] else [0] + reduce_all = True if axis is None or axis == [] or asvector else False + axis = axis if axis is not None and axis != [] else [0] reduce_type = ( 'reduce_max' if porder == np.float64('inf') else 'reduce_min' @@ -830,7 +830,7 @@ def mat_norm(input, porder=1.0, axis=None): when porder is in (1, -1, inf, -inf) """ reduce_all = True if axis is None or axis == [] else False - axis = axis if axis != None and axis != [] else [0] + axis = axis if axis is not None and axis != [] else [0] keepdim = False if in_dygraph_mode(): @@ -1106,7 +1106,7 @@ def empty_tensor(input, shape): "input should be a matrix or batches of matrices, " + "but the dimention of received input is {}".format(len(x_shape)) ) - if p == None: + if p is None: p = 2 x_size = 0 if (0 in x_shape) else 1 if p in ("fro", "nuc", 1, -1, np.inf, -np.inf): diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 3379a60a3bc5e..84a7b7a385a1e 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -503,14 +503,14 @@ def unstack(x, axis=0, num=None): """ if in_dygraph_mode(): - if num == None: + if num is None: num = x.shape[axis] if num == 0: return [] return _C_ops.unstack(x, axis, num) if _non_static_mode(): - if num == None: + if num is None: num = x.shape[axis] if num == 0: return [] diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index f34851fdccade..eafa9944c3fda 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3921,13 +3921,13 @@ def all(x, axis=None, keepdim=False, name=None): return _C_ops.all(x, axis, keepdim) if _in_legacy_dygraph(): - axis = axis if axis != None and axis != [] else [0] + axis = axis if axis is not None and axis != [] else [0] return _legacy_C_ops.reduce_all( x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all_flag ) attrs = { - 'dim': axis if axis != None and axis != [] and axis != () else [0], + 'dim': axis if axis is not None and axis != [] and axis != () else [0], 'keep_dim': keepdim, 'reduce_all': reduce_all_flag, } @@ -4010,13 +4010,13 @@ def any(x, axis=None, keepdim=False, name=None): return _C_ops.any(x, axis, keepdim) if _in_legacy_dygraph(): - axis = axis if axis != None and axis != [] else [0] + axis = axis if axis is not None and axis != [] else [0] return _legacy_C_ops.reduce_any( x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all_flag ) attrs = { - 'dim': axis if axis != None and axis != [] and axis != () else [0], + 'dim': axis if axis is not None and axis != [] and axis != () else [0], 'keep_dim': keepdim, 'reduce_all': reduce_all_flag, } @@ -4322,7 +4322,7 @@ def logit(x, eps=None, name=None): """ - if eps == None: + if eps is None: eps = 
0.0 if _in_legacy_dygraph(): return _legacy_C_ops.logit(x, 'eps', eps) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 355bc63f037b6..62509aaedf8af 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -904,7 +904,7 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None): """ if in_dygraph_mode(): - if axis == None: + if axis is None: axis = -1 out, indices = _C_ops.topk(x, k, axis, largest, sorted) return out, indices diff --git a/python/paddle/tests/test_utils_lazyimport.py b/python/paddle/tests/test_utils_lazyimport.py index 98324650bf0a9..1064bd8086422 100644 --- a/python/paddle/tests/test_utils_lazyimport.py +++ b/python/paddle/tests/test_utils_lazyimport.py @@ -22,7 +22,7 @@ def setup(self): def func_test_lazy_import(self): paddle = try_import('paddle') - self.assertTrue(paddle.__version__ is not None) + self.assertIsNotNone(paddle.__version__) with self.assertRaises(ImportError) as context: paddle2 = try_import('paddle2') diff --git a/python/paddle/text/datasets/imdb.py b/python/paddle/text/datasets/imdb.py index d71d23ff69231..cc93dc5f52da4 100644 --- a/python/paddle/text/datasets/imdb.py +++ b/python/paddle/text/datasets/imdb.py @@ -112,7 +112,7 @@ def _tokenize(self, pattern): data = [] with tarfile.open(self.data_file) as tarf: tf = tarf.next() - while tf != None: + while tf is not None: if bool(pattern.match(tf.name)): # newline and punctuations removal and ad-hoc tokenization. data.append( diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 1e8fc049efda7..05eea5802b5b0 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -583,7 +583,7 @@ def _is_list_or_tuple_(data): if in_dygraph_mode(): step_w, step_h = steps - if max_sizes == None: + if max_sizes is None: max_sizes = [] box, var = _C_ops.prior_box( input, diff --git a/tools/analysisPyXml.py b/tools/analysisPyXml.py index b184ef76fcc54..200116779dbeb 100644 --- a/tools/analysisPyXml.py +++ b/tools/analysisPyXml.py @@ -67,7 +67,7 @@ def analysisPyXml(rootPath, ut): ) ): pattern = r"""(.*) = ('*')|(.*) = ("*")|(.*) = (\d)|(.*) = (-\d)|(.*) = (None)|(.*) = (True)|(.*) = (False)|(.*) = (URL_PREFIX*)|(.*) = (\[)|(.*) = (\{)|(.*) = (\()""" # a='b'/a="b"/a=0 - if re.match(pattern, output.strip()) == None: + if re.match(pattern, output.strip()) is None: pyCov_file.append(clazz_filename) coverageMessage = 'RELATED' break diff --git a/tools/check_op_desc.py b/tools/check_op_desc.py index 1c1ff14f7b68c..3bb1f23825394 100644 --- a/tools/check_op_desc.py +++ b/tools/check_op_desc.py @@ -173,7 +173,7 @@ def diff_attr(ori_attrs, new_attrs): for attr_name in attrs_only_in_new: attr_added_error_massage.append(attr_name) - if new_attrs.get(attr_name).get(DEFAULT_VALUE) == None: + if new_attrs.get(attr_name).get(DEFAULT_VALUE) is None: error, attr_error = True, True attr_added_def_error_massage.append(attr_name) diff --git a/tools/get_single_test_cov.py b/tools/get_single_test_cov.py index ee5f2d9fd5055..10aaa96ce7be1 100644 --- a/tools/get_single_test_cov.py +++ b/tools/get_single_test_cov.py @@ -88,7 +88,7 @@ def analysisFNDAFile(rootPath, test): fn, re.I, ) - if matchObj == None: + if matchObj is None: OP_REGIST = False break if not OP_REGIST: diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py index 7ece773aa7855..6c5d879df6c03 100644 --- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py +++ 
b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py @@ -354,7 +354,7 @@ def convert_op_proto_into_mlir(op_descs): attr in skipped_attr_list ): continue - if op_proto[ATTRS][attr][DEFAULT_VALUE] != None: + if op_proto[ATTRS][attr][DEFAULT_VALUE] is not None: if op_proto[ATTRS][attr][TYPE] in attr_mlir_converter: default_value = str( op_proto[ATTRS][attr][DEFAULT_VALUE] diff --git a/tools/test_runner.py b/tools/test_runner.py index 65da72b539df7..2f1b9a22ab3b9 100644 --- a/tools/test_runner.py +++ b/tools/test_runner.py @@ -28,7 +28,7 @@ def main(): sys.path.append(os.getcwd()) if core.is_compiled_with_cuda() or core.is_compiled_with_rocm(): - if os.getenv('FLAGS_enable_gpu_memory_usage_log') == None: + if os.getenv('FLAGS_enable_gpu_memory_usage_log') is None: os.environ['FLAGS_enable_gpu_memory_usage_log'] = 'true' os.environ['FLAGS_enable_gpu_memory_usage_log_mb'] = 'false' From d38010e87c880bdcf4ef618f45fbae015cd480e5 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Tue, 1 Nov 2022 22:15:46 +0800 Subject: [PATCH 73/91] [CodeStyle][E711][E712] update flake8 config (#47465) * [CodeStyle][E711][E712] update flake8 config * empty commit, test=document_fix --- .flake8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index a9e2e84c94d35..7b8120753dfea 100644 --- a/.flake8 +++ b/.flake8 @@ -19,7 +19,7 @@ ignore = E203,E262,E265,E266, E401,E402, E501, - E711,E712,E721,E722,E731,E741, + E721,E722,E731,E741, # F, see https://flake8.pycqa.org/en/latest/user/error-codes.html F405, From eb100c7b4b3cb0f02943c886e0b0b2af025641dd Mon Sep 17 00:00:00 2001 From: Tian Zheng Date: Wed, 2 Nov 2022 10:15:25 +0800 Subject: [PATCH 74/91] Add build option for CUDNN Frontend API (#47524) * Add build option for CUDNN Frontend API * Fix review comments * Change namespace for cudnn_frontend.h --- CMakeLists.txt | 2 + cmake/configure.cmake | 4 + cmake/external/cudnn-frontend.cmake | 60 ++++++++ cmake/flags.cmake | 5 + cmake/third_party.cmake | 5 + paddle/fluid/platform/flags.cc | 12 ++ paddle/phi/backends/dynload/CMakeLists.txt | 7 + paddle/phi/backends/dynload/cudnn.cc | 4 + paddle/phi/backends/dynload/cudnn.h | 13 ++ paddle/phi/backends/dynload/cudnn_frontend.h | 62 ++++++++ .../backends/dynload/cudnn_frontend_test.cc | 44 ++++++ .../0001-patch-for-paddle.patch | 137 ++++++++++++++++++ 12 files changed, 355 insertions(+) create mode 100644 cmake/external/cudnn-frontend.cmake create mode 100644 paddle/phi/backends/dynload/cudnn_frontend.h create mode 100644 paddle/phi/backends/dynload/cudnn_frontend_test.cc create mode 100644 patches/cudnn-frontend/0001-patch-for-paddle.patch diff --git a/CMakeLists.txt b/CMakeLists.txt index f2489526c5c89..187162cffe722 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -305,6 +305,8 @@ option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF) option(WITH_FLPS "FL PS mode" OFF) option(WITH_RPC "Compile with rpc support" ${WITH_DISTRIBUTE}) +option(WITH_CUDNN_FRONTEND + "Compile with CUDNN Frontend API support (experimental)" OFF) if(WITH_RECORD_BUILDTIME) set_property( diff --git a/cmake/configure.cmake b/cmake/configure.cmake index c49a879fa0291..5147e54ea71fc 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -248,3 +248,7 @@ endif() if(WITH_GPU_GRAPH) add_definitions(-DPADDLE_WITH_GPU_GRAPH) endif() + +if(WITH_CUDNN_FRONTEND) + add_definitions(-DPADDLE_WITH_CUDNN_FRONTEND) +endif() diff --git a/cmake/external/cudnn-frontend.cmake 
b/cmake/external/cudnn-frontend.cmake new file mode 100644 index 0000000000000..10a49110f842b --- /dev/null +++ b/cmake/external/cudnn-frontend.cmake @@ -0,0 +1,60 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +include(ExternalProject) + +set(CUDNN_FRONTEND_CUDNN_MIN_VERSION 8000) + +if(NOT WITH_GPU) + message(FATAL_ERROR "Can't enable CUDNN Frontend API without CUDA.") +endif() +if(CUDNN_VERSION LESS 8000) + message( + FATAL_ERROR + "Minimum CUDNN version is ${CUDNN_FRONTEND_CUDNN_MIN_VERSION}. Current: ${CUDNN_VERSION}" + ) +endif() + +# Version: v0.7.1 +set(CUDNN_FRONTEND_PREFIX_DIR ${THIRD_PARTY_PATH}/cudnn-frontend) +set(CUDNN_FRONTEND_SOURCE_DIR + ${THIRD_PARTY_PATH}/cudnn-frontend/src/extern_cudnn_frontend/include) +set(CUDNN_FRONTEND_REPOSITORY https://github.com/NVIDIA/cudnn-frontend.git) +set(CUDNN_FRONTEND_TAG v0.7.1) + +set(CUDNN_FRONTEND_INCLUDE_DIR ${CUDNN_FRONTEND_SOURCE_DIR}) +include_directories(${CUDNN_FRONTEND_INCLUDE_DIR}) + +message( + STATUS + "Adding cudnn-frontend. Version: ${CUDNN_FRONTEND_TAG}. Directory: ${CUDNN_FRONTEND_INCLUDE_DIR}" +) + +ExternalProject_Add( + extern_cudnn_frontend + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${CUDNN_FRONTEND_REPOSITORY} + GIT_TAG ${CUDNN_FRONTEND_TAG} + PREFIX ${CUDNN_FRONTEND_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + patch -d ${CUDNN_FRONTEND_SOURCE_DIR} -p2 < + ${PADDLE_SOURCE_DIR}/patches/cudnn-frontend/0001-patch-for-paddle.patch + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "") + +add_library(cudnn-frontend INTERFACE) +add_dependencies(cudnn-frontend extern_cudnn_frontend) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 39261a788bd18..a58f4094bbbc4 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -162,6 +162,11 @@ if(NOT WIN32) ) endif() + if(WITH_CUDNN_FRONTEND) + # flags from https://github.com/NVIDIA/cudnn-frontend/blob/v0.7.1/CMakeLists.txt + set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-sign-compare -Wno-non-virtual-dtor) + endif() + if(WITH_ASCEND_CL AND WITH_ARM_BRPC) set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new) endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 4475f5b14d28e..28b2bae951879 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -515,4 +515,9 @@ if(WITH_GPU endif() endif() +if(WITH_CUDNN_FRONTEND) + include(external/cudnn-frontend) # download cudnn-frontend + list(APPEND third_party_deps extern_cudnn_frontend) +endif() + add_custom_target(third_party ALL DEPENDS ${third_party_deps}) diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index bac075c1d9053..23ecfecbbd2d9 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -1021,3 +1021,15 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_string(jit_engine_type, "Predictor", "Choose default funciton type in JitLayer."); + +#ifdef PADDLE_WITH_CUDNN_FRONTEND +/** + * CUDNNv8 related FLAG + * Name: 
enable_cudnn_frontend + * Since Version: 2.5.0 + * Value Range: bool, default=false + * Example: + * Note: Enable CUDNNv8 Frontend API for CUDNN kernels. + */ +PADDLE_DEFINE_EXPORTED_bool(enable_cudnn_frontend, false, ""); +#endif // PADDLE_WITH_CUDNN_FRONTEND diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 49ab8d4f0c91a..98a44461ac4b2 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -99,3 +99,10 @@ if(MKL_FOUND AND WITH_ONEMKL) DEPS phi_dynamic_loader) target_include_directories(phi_dynload_mklrt PRIVATE ${MKL_INCLUDE}) endif() + +if(WITH_CUDNN_FRONTEND) + nv_test( + cudnn_frontend_test + SRCS cudnn_frontend_test.cc + DEPS phi_dynload_cuda cudnn-frontend) +endif() diff --git a/paddle/phi/backends/dynload/cudnn.cc b/paddle/phi/backends/dynload/cudnn.cc index 8aa3b623273d7..9bd38a89ab177 100644 --- a/paddle/phi/backends/dynload/cudnn.cc +++ b/paddle/phi/backends/dynload/cudnn.cc @@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_FRONTEND +CUDNN_DNN_ROUTINE_EACH_FRONTEND(DEFINE_WRAP); +#endif + bool HasCUDNN() { std::call_once(cudnn_dso_flag, []() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h index 7b9004308e95b..3292beb037110 100644 --- a/paddle/phi/backends/dynload/cudnn.h +++ b/paddle/phi/backends/dynload/cudnn.h @@ -194,6 +194,19 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif +#ifdef PADDLE_WITH_CUDNN_FRONTEND +#define CUDNN_DNN_ROUTINE_EACH_FRONTEND(__macro) \ + __macro(cudnnBackendCreateDescriptor); \ + __macro(cudnnBackendDestroyDescriptor); \ + __macro(cudnnBackendExecute); \ + __macro(cudnnBackendFinalize); \ + __macro(cudnnBackendGetAttribute); \ + __macro(cudnnBackendSetAttribute); \ + __macro(cudnnGetStream); \ + __macro(cudnnReorderFilterAndBias); +CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + } // namespace dynload } // namespace phi diff --git a/paddle/phi/backends/dynload/cudnn_frontend.h b/paddle/phi/backends/dynload/cudnn_frontend.h new file mode 100644 index 0000000000000..4d0b67ce2285c --- /dev/null +++ b/paddle/phi/backends/dynload/cudnn_frontend.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "gflags/gflags.h" +#include "glog/logging.h" + +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/gpu/gpu_info.h" + +DECLARE_bool(enable_cudnn_frontend); + +// Redirect the CUDNN APIs in the cudnn_frontend namespace to +// the functions in phi::dynload +#define CUDNN_FRONTEND_OVERRIDE_SYMBOL(__name) using phi::dynload::__name + +#define CUDNN_FRONTEND_APPLY_EACH(__macro) \ + __macro(cudnnBackendCreateDescriptor); \ + __macro(cudnnBackendDestroyDescriptor); \ + __macro(cudnnBackendExecute); \ + __macro(cudnnBackendFinalize); \ + __macro(cudnnBackendGetAttribute); \ + __macro(cudnnBackendSetAttribute); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnGetStream); \ + __macro(cudnnGetVersion); \ + __macro(cudnnReorderFilterAndBias); \ + __macro(cudnnSetFilterNdDescriptor); + +namespace cudnn_frontend { +CUDNN_FRONTEND_APPLY_EACH(CUDNN_FRONTEND_OVERRIDE_SYMBOL); +} // namespace cudnn_frontend + +// clang-format off +#include // NOLINT +#include // NOLINT +#include // NOLINT +// clang-format on + +namespace phi { +namespace dynload { +inline bool IsCudnnFrontendEnabled() { + int cudnn_version = phi::backends::gpu::DnnVersion(); + bool flag_enabled = FLAGS_enable_cudnn_frontend && (cudnn_version >= 8000); + VLOG(3) << "[cudnn_frontend] flag_enabled=" << flag_enabled; + return flag_enabled; +} +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/cudnn_frontend_test.cc b/paddle/phi/backends/dynload/cudnn_frontend_test.cc new file mode 100644 index 0000000000000..03425a7545e8d --- /dev/null +++ b/paddle/phi/backends/dynload/cudnn_frontend_test.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/dynload/cudnn_frontend.h" + +TEST(CudnnFrontendTest, TensorCreation) { + // Consider creation of a 2d Tensor + // n,c,h,w as 4,32,32,32 + std::cout << "Tensor creation comparison" << std::endl; + std::array tensor_dim = {4, 32, 32, 32}; + std::array tensor_str = {32768, 1024, 32, 1}; // NCHW format + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; + int64_t alignment = sizeof(float); + int64_t id = 0xD0D0CACA; // Some magic number + + try { + auto tensor = cudnn_frontend::TensorBuilder() + .setDim(tensor_dim.size(), tensor_dim.data()) + .setStrides(tensor_str.size(), tensor_str.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(data_type) + .build(); + } catch (cudnn_frontend::cudnnException &e) { + std::cout << "Exception in tensor creation " << e.what() << std::endl; + FAIL(); + } + std::cout << "Finished tensor creation." 
<< std::endl; +} diff --git a/patches/cudnn-frontend/0001-patch-for-paddle.patch b/patches/cudnn-frontend/0001-patch-for-paddle.patch new file mode 100644 index 0000000000000..bf5288f06eea2 --- /dev/null +++ b/patches/cudnn-frontend/0001-patch-for-paddle.patch @@ -0,0 +1,137 @@ +From dce3465da518641ee177187fbc0c0d36faea28f2 Mon Sep 17 00:00:00 2001 +From: Tian Zheng +Date: Thu, 27 Oct 2022 20:33:16 -0700 +Subject: [PATCH] patch for paddle + +--- + include/cudnn_frontend_ExecutionPlan.h | 10 +++++++--- + include/cudnn_frontend_ExecutionPlanCache.h | 2 +- + include/cudnn_frontend_OperationGraph.h | 2 +- + include/cudnn_frontend_find_plan.h | 6 +++--- + include/cudnn_frontend_get_plan.h | 4 ++-- + 5 files changed, 14 insertions(+), 10 deletions(-) + +diff --git a/include/cudnn_frontend_ExecutionPlan.h b/include/cudnn_frontend_ExecutionPlan.h +index 7bed4b4..3314b5c 100644 +--- a/include/cudnn_frontend_ExecutionPlan.h ++++ b/include/cudnn_frontend_ExecutionPlan.h +@@ -167,6 +167,10 @@ class ExecutionPlan_v8 : public BackendDescriptor { + return json_string; + #endif + } ++ ++ ManagedOpaqueDescriptor GetEngineConfig() const { ++ return engine_config; ++ } + + ExecutionPlan_v8(ExecutionPlan_v8 const &) = default; + ExecutionPlan_v8 & +@@ -182,7 +186,7 @@ class ExecutionPlan_v8 : public BackendDescriptor { + CUDNN_TYPE_NUMERICAL_NOTE, + CUDNN_NUMERICAL_NOTE_TYPE_COUNT, + &elem_count, +- NULL); ++ nullptr); + numeric_notes_vec.resize(elem_count); + status = cudnnBackendGetAttribute(extractedEngine_, + CUDNN_ATTR_ENGINE_NUMERICAL_NOTE, +@@ -206,7 +210,7 @@ class ExecutionPlan_v8 : public BackendDescriptor { + CUDNN_TYPE_BEHAVIOR_NOTE, + CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, + &elem_count, +- NULL); ++ nullptr); + behavior_notes_vec.resize(elem_count); + status = cudnnBackendGetAttribute(extractedEngine_, + CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE, +@@ -310,7 +314,7 @@ class ExecutionPlan_v8 : public BackendDescriptor { + CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE, + CUDNN_TYPE_INT64, + 1, +- NULL, ++ nullptr, + &workSpaceSize); + if (status != CUDNN_STATUS_SUCCESS) { + set_error_and_throw_exception(this, +diff --git a/include/cudnn_frontend_ExecutionPlanCache.h b/include/cudnn_frontend_ExecutionPlanCache.h +index 99a157c..741c490 100644 +--- a/include/cudnn_frontend_ExecutionPlanCache.h ++++ b/include/cudnn_frontend_ExecutionPlanCache.h +@@ -94,7 +94,7 @@ class ExecutionPlanCache_v1 { + + /// String to map of feature_vector to execution plan + /// For a given FeatureVector of type T according to the Operation Graph, we get the plan. 
+- using FeatureVectorToPlanMap = std::map; ++ using FeatureVectorToPlanMap = std::map; + FeatureVectorToPlanMap cache; + + mutable std::mutex cache_mutex; +diff --git a/include/cudnn_frontend_OperationGraph.h b/include/cudnn_frontend_OperationGraph.h +index 1478ce8..7894080 100644 +--- a/include/cudnn_frontend_OperationGraph.h ++++ b/include/cudnn_frontend_OperationGraph.h +@@ -78,7 +78,7 @@ class OperationGraph_v8 : public BackendDescriptor { + CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT, + CUDNN_TYPE_INT64, + 1, +- NULL, ++ nullptr, + &global_count); + if (status != CUDNN_STATUS_SUCCESS) { + set_error_and_throw_exception(this, +diff --git a/include/cudnn_frontend_find_plan.h b/include/cudnn_frontend_find_plan.h +index 02a08a1..5f94e45 100644 +--- a/include/cudnn_frontend_find_plan.h ++++ b/include/cudnn_frontend_find_plan.h +@@ -53,7 +53,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const + cudaDeviceSynchronize(); + + cudaStream_t stream = nullptr; +- ::cudnnGetStream(handle, &stream); ++ cudnnGetStream(handle, &stream); + + for (auto &plan : plans) { + float time_ms = 0.0f; +@@ -61,7 +61,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const + float min_time_ms = std::numeric_limits::max(); + + // Warm-up run +- auto warmup_status = ::cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()); ++ auto warmup_status = cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()); + if (warmup_status != CUDNN_STATUS_SUCCESS) { + getLogger() << "[cudnn_frontend] Plan " << plan.getTag() << " failed with " << to_string(warmup_status) << std::endl; + continue; +@@ -71,7 +71,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const + for (int i = 0; i < maxIterCount; i++) { + cudaEventRecord(start, stream); + +- ::cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()); ++ cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()); + + cudaEventRecord(stop, stream); + cudaEventSynchronize(stop); +diff --git a/include/cudnn_frontend_get_plan.h b/include/cudnn_frontend_get_plan.h +index 50535ab..c43eec9 100644 +--- a/include/cudnn_frontend_get_plan.h ++++ b/include/cudnn_frontend_get_plan.h +@@ -26,7 +26,7 @@ + + namespace cudnn_frontend { + +-auto ++inline auto + EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGraph) + -> executionPlans_t { + // Creating a set of execution plans that are supported. +@@ -47,7 +47,7 @@ EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGra + return plans; + } + +-auto ++inline auto + EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGraph, Predicate pred) + -> executionPlans_t { + // Creating a set of execution plans that are supported. 
+-- +2.25.1 + From 75b73400c4cf8ce77f15a357e1737142ccafe9bc Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Wed, 2 Nov 2022 10:28:26 +0800 Subject: [PATCH 75/91] fix sparse_attention unittest (#47547) --- python/paddle/fluid/tests/unittests/test_sparse_attention_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py index 92e2d0200c80d..e10b20a073aec 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py @@ -405,7 +405,7 @@ def test_static_graph(self): ) np.testing.assert_allclose( - fetches_result, expected_result, rtol=1e-05, atol=1e-05 + fetches_result[0], expected_result, rtol=1e-05, atol=1e-05 ) def test_dygraph(self): From 77395619769b734fbb001a6340ef16ac0e8beeea Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Wed, 2 Nov 2022 10:34:33 +0800 Subject: [PATCH 76/91] [XPU] add int64 support for slice and subtract. (#47409) * [XPU] add int64 support for slice and subtract. test=kunlun * try to fix xpu compile. test=kunlun * try to fix xpu compile. test=kunlun * try to fix xpu compile. test=kunlun * remove unnecessary modification. test=kunlun --- paddle/fluid/platform/device/xpu/xpu2_op_list.h | 6 ++++-- paddle/phi/kernels/elementwise_kernel.cc | 3 ++- paddle/phi/kernels/xpu/elementwise_subtract_kernel.cc | 3 ++- paddle/phi/kernels/xpu/slice_kernel.cc | 3 ++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 8f487cf6cd72e..73898354dc19d 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -178,7 +178,8 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"elementwise_sub", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"elementwise_mod", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), @@ -497,7 +498,8 @@ XPUOpMap& get_kl2_ops() { {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index ba58bae0035d1..88551b34109b6 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -342,7 +342,8 @@ PD_REGISTER_KERNEL(subtract, ALL_LAYOUT, phi::SubtractKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + int64_t) {} #endif #if defined PADDLE_WITH_XPU diff --git a/paddle/phi/kernels/xpu/elementwise_subtract_kernel.cc b/paddle/phi/kernels/xpu/elementwise_subtract_kernel.cc index 299b5f80d7dde..4e18264d71343 100644 --- a/paddle/phi/kernels/xpu/elementwise_subtract_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_subtract_kernel.cc @@ -36,4 +36,5 @@ PD_REGISTER_KERNEL(subtract_raw, ALL_LAYOUT, phi::SubtractRawKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + 
int64_t) {} diff --git a/paddle/phi/kernels/xpu/slice_kernel.cc b/paddle/phi/kernels/xpu/slice_kernel.cc index 3d01fae33e1fe..b30c6908357a5 100644 --- a/paddle/phi/kernels/xpu/slice_kernel.cc +++ b/paddle/phi/kernels/xpu/slice_kernel.cc @@ -113,4 +113,5 @@ PD_REGISTER_KERNEL(slice, phi::SliceRawKernel, float, int, - phi::dtype::float16) {} + phi::dtype::float16, + int64_t) {} From bafa890a7ce32e609ce795602aad0b6f0e637476 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 2 Nov 2022 10:40:29 +0800 Subject: [PATCH 77/91] Support generating static code of high order grad op by yaml (#47511) * support generating static code of high order grad op by yaml * polish code --- paddle/fluid/operators/activation_op.cc | 88 ------------------- paddle/phi/api/yaml/backward.yaml | 35 ++++++++ paddle/phi/api/yaml/generator/generate_op.py | 24 ++++- .../generator/templates/operator_utils.c.j2 | 12 +-- paddle/phi/api/yaml/legacy_backward.yaml | 35 -------- paddle/phi/api/yaml/legacy_ops.yaml | 10 --- paddle/phi/api/yaml/op_compat.yaml | 10 ++- paddle/phi/api/yaml/ops.yaml | 10 +++ paddle/phi/ops/compat/activation_sig.cc | 21 ----- 9 files changed, 81 insertions(+), 164 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index b4cf9e9e009de..41f813b436293 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -161,13 +161,6 @@ Relu Activation Operator. )DOC"; -UNUSED constexpr char TanhDoc[] = R"DOC( -Tanh Activation Operator. - -$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ - -)DOC"; - UNUSED constexpr char TanhShrinkDoc[] = R"DOC( TanhShrink Activation Operator. @@ -529,7 +522,6 @@ It is recommended to use the defaults for this activation. REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); -REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc); REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc); REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc); REGISTER_ACTIVATION_OP_MAKER(Rsqrt, RsqrtDoc); @@ -699,54 +691,6 @@ class SigmoidTripleGradMaker } }; -template -class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { - public: - using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("tanh_grad_grad"); - // input1: Out - op->SetInput("Out", this->Input("Out")); - // input2: ddx - op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); - op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); - op->SetAttrMap(this->Attrs()); - // output: ddy - op->SetOutput("DOutNew", this->InputGrad("Out")); - op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); - } -}; - -template -class TanhTripleGradMaker : public ::paddle::framework::SingleGradOpMaker { - public: - using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("tanh_triple_grad"); - // Out, DDX, DOut, D_DDOut, D_DOut_New // input - // D_OutNew, D_DOut, D_DDx // output - // input1: Out - op->SetInput("Out", this->Input("Out")); - // input2: ddx - op->SetInput("DDX", this->Input("DDX")); - // input3: dout - op->SetInput("DOut", this->Input("DOut")); - // input4: d_ddout - op->SetInput("D_DDOut", this->OutputGrad("DDOut")); - // input5: d_dout_new - op->SetInput("D_DOut_New", this->OutputGrad("DOutNew")); - op->SetAttrMap(this->Attrs()); - - // output: d_dOut, d_OutNew, d_ddx - op->SetOutput("D_OutNew", 
this->InputGrad("Out")); - op->SetOutput("D_DOut", this->InputGrad("DOut")); - op->SetOutput("D_DDx", this->InputGrad("DDX")); - } -}; // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 template @@ -1103,38 +1047,6 @@ REGISTER_OPERATOR(sigmoid_triple_grad, /* ========================================================================== */ -/* ========================== tanh register ============================= */ -REGISTER_OPERATOR( - tanh, - ops::ActivationOp, - ops::TanhOpMaker, - ops::ActivationOpInferVarType, - ops::ActivationGradOpMaker::FwdDeps(), - paddle::framework::OpDesc>, - ops::ActivationGradOpMaker::FwdDeps(), - paddle::imperative::OpBase>, - std::conditional>(), - ops::ActFwdInplaceInferer, - void>::type); -REGISTER_OPERATOR(tanh_grad, - ops::ActivationOpGrad, - ops::ActivationGradOpInplaceInferer, - ops::TanhDoubleGradMaker, - ops::TanhDoubleGradMaker) -REGISTER_OPERATOR( - tanh_grad_grad, - ops::ActivationOpDoubleGrad::FwdDeps()>, - ops::ActivationDoubleGradOpInplaceInferer, - ops::TanhTripleGradMaker, - ops::TanhTripleGradMaker); - -REGISTER_OPERATOR( - tanh_triple_grad, - ops::ActivationOpTripleGrad::FwdDeps()>, - ops::ActivationTripleGradOpInplaceInferer); - -/* ========================================================================== */ - /* ========================== relu register ============================= */ REGISTER_OPERATOR( relu, diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index a3611bcca3477..faf2d7660ea37 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -521,6 +521,41 @@ func : tan_grad inplace : (out_grad -> x_grad) +- backward_op : tanh_double_grad + forward : tanh_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor out, Tensor grad_out, Tensor grad_x_grad) + output : Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [out, out] + kernel : + func : tanh_double_grad + backward : tanh_triple_grad + inplace : (grad_x_grad -> grad_out_grad) + +- backward_op : tanh_grad + forward : tanh (Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] + kernel : + func : tanh_grad + backward : tanh_double_grad + inplace : (out_grad -> x_grad) + +- backward_op : tanh_triple_grad + forward : tanh_double_grad (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward) -> Tensor(grad_out_new), Tensor(grad_out_grad) + args : (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward, Tensor grad_out_new_grad, Tensor grad_out_grad_grad) + output : Tensor(out_grad), Tensor(grad_out_forward_grad), Tensor(grad_x_grad_forward_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [out, out, grad_x_grad_forward] + kernel : + func : tanh_triple_grad + inplace : (grad_x_grad_forward -> grad_out_forward_grad) + - backward_op : trace_grad forward : trace (Tensor x, int offset, int axis1, int axis2) -> Tensor(out) args : (Tensor x, Tensor out_grad, int offset, int axis1, int axis2) diff --git a/paddle/phi/api/yaml/generator/generate_op.py b/paddle/phi/api/yaml/generator/generate_op.py index df2281ee3d8a1..777b5283743fd 100644 --- a/paddle/phi/api/yaml/generator/generate_op.py +++ b/paddle/phi/api/yaml/generator/generate_op.py @@ -86,12 +86,30 @@ def get_api_and_op_name(api_item): if api_name != op_name: forward_api_item['op_name'] = op_name if 'backward' in api_args and has_backward: - 
bw_api_name, bw_op_name = get_api_and_op_name( - api_args['backward'].split(',')[0] - ) + backward_op_list = api_args['backward'].split(',') + bw_api_name, bw_op_name = get_api_and_op_name(backward_op_list[0]) forward_api_item['backward'] = bw_op_name backward_api_item['op_name'] = bw_op_name + # for double grad + if len(backward_op_list) > 1: + double_grad_api_name, double_grad_op_name = get_api_and_op_name( + backward_op_list[1] + ) + double_grad_item = backward_api_dict[double_grad_api_name] + backward_api_item['backward'] = double_grad_op_name + double_grad_item['op_name'] = double_grad_op_name + + # for triple grad + if len(backward_op_list) > 2: + ( + triple_grad_api_name, + triple_grad_op_name, + ) = get_api_and_op_name(backward_op_list[2]) + triple_grad_item = backward_api_dict[triple_grad_api_name] + double_grad_item['backward'] = triple_grad_op_name + triple_grad_item['op_name'] = triple_grad_op_name + key_set = ['inputs', 'attrs', 'outputs'] args_map = {} for key in key_set: diff --git a/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 index 60fd251f446d2..502266ce7a994 100644 --- a/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 @@ -389,7 +389,7 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker forward_output_orig_names)}}); {% endfor %} - grad_op->SetAttrMap(this->Attrs()); + grad_op->SetAttrMap(this->Attrs()); {% for attr in api["attrs"] %} {% set attr_name = attr["name"] %} {% if attr_name in forward_attr_names %} @@ -456,15 +456,15 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker input_orig_names, output_orig_names) %}{# inline #} {% if name in input_names %} {% set name_in_forward_orig = input_orig_names[input_names.index(name)]%} -Input("{{name_in_forward_orig}}") +Input({{name_in_forward_orig | to_opmaker_name}}) {%- elif name in output_names %} {% set name_in_forward_orig = output_orig_names[output_names.index(name)]%} -Output("{{name}}") +Output({{name | to_opmaker_name}}) {%- elif name.endswith("_grad") %}{# output grad#} {% set name_in_forward = name[:-5] %} {% if name_in_forward in output_names %} {% set name_in_forward_orig = output_orig_names[output_names.index(name_in_forward)] %} -OutputGrad("{{name_in_forward_orig}}") +OutputGrad({{name_in_forward_orig | to_opmaker_name}}) {%- endif %} {%- endif %} {%- endmacro %} @@ -474,11 +474,11 @@ OutputGrad("{{name_in_forward_orig}}") {% if name[:-5] in input_names %} {% set name_in_forward = name[:-5] %} {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%} -InputGrad("{{name[:-5]}}") +InputGrad({{name_in_forward_orig | to_opmaker_name}}) {%- elif (name | to_input_name) in input_names %} {% set name_in_forward = name | to_input_name %} {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%} -InputGrad("{{name | to_input_name}}") +InputGrad({{name | to_input_name | to_opmaker_name}}) {%- endif %} {%- endmacro %} diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 4e9a4abfcdb65..d4eade92eeda2 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -2112,30 +2112,6 @@ kernel : func : take_along_axis_grad -- backward_op : tanh_double_grad - forward : tanh_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) - args : (Tensor out, Tensor grad_out, Tensor 
grad_x_grad) - output : Tensor(out_grad), Tensor(grad_out_grad) - infer_meta : - func : GeneralBinaryGradInferMeta - param : [out, out] - kernel : - func : tanh_double_grad - backward : tanh_triple_grad - inplace : (grad_x_grad -> grad_out_grad) - -- backward_op : tanh_grad - forward : tanh (Tensor x) -> Tensor(out) - args : (Tensor out, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [out] - kernel : - func : tanh_grad - backward : tanh_double_grad - inplace : (out_grad -> x_grad) - - backward_op : tanh_shrink_grad forward : tanh_shrink (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -2147,17 +2123,6 @@ func : tanh_shrink_grad inplace : (out_grad -> x_grad) -- backward_op : tanh_triple_grad - forward : tanh_double_grad (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward) -> Tensor(grad_out_new), Tensor(grad_out_grad) - args : (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward, Tensor grad_out_new_grad, Tensor grad_out_grad_grad) - output : Tensor(out_grad), Tensor(grad_out_forward_grad), Tensor(grad_x_grad_forward_grad) - infer_meta : - func : GeneralTernaryGradInferMeta - param : [out, out, grad_x_grad_forward] - kernel : - func : tanh_triple_grad - inplace : (grad_x_grad_forward -> grad_out_forward_grad) - - backward_op : temporal_shift_grad forward : temporal_shift(Tensor x, int seg_num, float shift_ratio, str data_format_str) -> Tensor(out) args : (Tensor out_grad, int seg_num, float shift_ratio, str data_format_str) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 03dbb08b59fb1..3c3e050b38e7b 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -2394,16 +2394,6 @@ data_type : arr backward : take_along_axis_grad -- op : tanh - args : (Tensor x) - output : Tensor(out) - infer_meta : - func : UnchangedInferMeta - kernel : - func : tanh - inplace : (x -> out) - backward : tanh_grad - - op : tanh_shrink args : (Tensor x) output : Tensor diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 2857beccb10d2..e8a587a5da1a2 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -5,6 +5,10 @@ - op : abs backward : abs_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false] @@ -889,7 +893,11 @@ attrs : [bool use_mkldnn = false, bool use_cudnn = false] - op : tanh - backward : tanh_grad + backward : tanh_grad, tanh_double_grad (tanh_grad_grad), tanh_triple_grad + inputs : + x : X + outputs : + out : Out extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 5fd80df6864cf..78eb97984013e 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -461,6 +461,16 @@ func : tan backward : tan_grad +- op : tanh + args : (Tensor x) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + kernel : + func : tanh + inplace : (x -> out) + backward : tanh_grad + - op : trace args : (Tensor x, int offset = 0, int axis1 = 0, int axis2 = 1) output : Tensor diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 85e8f7c2de721..4d115eed62607 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -67,7 +67,6 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Softplus, "beta" comma "threshold"); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, 
"relu", ); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sqrt, "sqrt", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Rsqrt, "rsqrt", ); // NOLINT @@ -94,20 +93,6 @@ KernelSignature ReluDoubleGradOpArgumentMapping( return KernelSignature("relu_double_grad", {"Out", "DDX"}, {}, {"DDOut"}); } -KernelSignature TanhDoubleGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "tanh_double_grad", {"Out", "DOut", "DDX"}, {}, {"DOutNew", "DDOut"}); -} - -KernelSignature TanhTripleGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("tanh_triple_grad", - {"Out", "DOut", "DDX", "D_DOut_New", "D_DDOut"}, - {}, - {"D_OutNew", "D_DOut", "D_DDx"}); -} - KernelSignature SigmoidDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( @@ -198,7 +183,6 @@ KernelSignature PowGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad); -PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad); PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink); PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad); @@ -227,11 +211,6 @@ PD_REGISTER_ARG_MAPPING_FN(softplus_grad, phi::SoftplusGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad, phi::ReluDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tanh_grad_grad, - phi::TanhDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tanh_triple_grad, - phi::TanhTripleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::HardTanhGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(leaky_relu, phi::LeakyReluOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad, From 99f601885387a90bb9185f2d3d7e1b7b5ed859f5 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Wed, 2 Nov 2022 10:50:32 +0800 Subject: [PATCH 78/91] support unbalanced data for pipeline (#47199) * add unbalanced data * fix utest --- .../fleet/meta_parallel/pipeline_parallel.py | 78 ++++++++++--------- ...parallel_pp_transformer_unbalanced_data.py | 67 ++++++++++++++++ ...test_parallel_dygraph_pipeline_parallel.py | 7 ++ 3 files changed, 115 insertions(+), 37 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 89a3d619218b8..b7d1eb39c0174 100755 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -355,51 +355,55 @@ def _backward_step(self, input_tensor, output_tensor, output_tensor_grad): input_tensor_grad = input_tensor.grad return input_tensor_grad - def _load_micro_batch(self, cache_id): - inputs = self.data + def _check_data_vaild(self, data): + batch_size = data.shape[0] + assert self.micro_batch_size * self.accumulate_steps == batch_size, ( + "batch_size needs to be divisible by micro_batch_size. Currently, " + "batch_size = %d, micro_batch_size = %d, accumulate_steps = %d." 
+ % (batch_size, self.micro_batch_size, self.accumulate_steps) + ) + + def _load_micro_batch_impl(self, inputs, cache_id): begin = cache_id * self.micro_batch_size end = begin + self.micro_batch_size - # The virtual first and last pipeline stage need data, all others don't need. + if isinstance(inputs, tuple): + output = [] + for data in inputs: + if isinstance(data, list): + assert ( + len(data) == self.accumulate_steps + ), "length of data should be %d, but it is %d" % ( + self.accumulate_steps, + len(data), + ) + output.append(data[cache_id].detach()) + else: + self._check_data_vaild(data) + output.append(data[begin:end, :].detach()) + return tuple(output) + + elif isinstance(inputs, list): + assert ( + len(inputs) == self.accumulate_steps + ), "length of data should be %d, but it is %d" % ( + self.accumulate_steps, + len(inputs), + ) + return inputs[cache_id].detach() + else: + self._check_data_vaild(inputs) + return inputs[begin:end, :].detach() + + def _load_micro_batch(self, cache_id): + inputs = self.data if self.is_pipeline_first_stage(): assert len(inputs) == 2, "length of input should be 2" - if isinstance(inputs[0], tuple): - assert ( - len(inputs[0]) > 1 - ), "If you use tuple for input data, it should have at least two inputs." - batch_size = inputs[0][0].shape[0] - assert ( - self.micro_batch_size * self.accumulate_steps == batch_size - ), ( - "batch_size needs to be divisible by micro_batch_size. Currently, " - "batch_size = %d, micro_batch_size = %d, accumulate_steps = %d." - % (batch_size, self.micro_batch_size, self.accumulate_steps) - ) - data = [input[begin:end, :].detach() for input in inputs[0]] - return tuple(data) - else: - batch_size = inputs[0].shape[0] - assert ( - self.micro_batch_size * self.accumulate_steps == batch_size - ) - return inputs[0][begin:end, :].detach() + return self._load_micro_batch_impl(inputs[0], cache_id) elif self.is_pipeline_last_stage(): assert len(inputs) == 2, "length of input should be 2" - if isinstance(inputs[1], tuple): - batch_size = inputs[1][0].shape[0] - assert ( - self.micro_batch_size * self.accumulate_steps == batch_size - ) - data = [input[begin:end, :].detach() for input in inputs[1]] - return tuple(data) - else: - batch_size = inputs[1].shape[0] - assert ( - self.micro_batch_size * self.accumulate_steps == batch_size - ) - return inputs[1][begin:end, :].detach() + return self._load_micro_batch_impl(inputs[1], cache_id) else: - # No data input is required for other stages inputs = None def _broadcast_final_loss(self): diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py new file mode 100644 index 0000000000000..1db15407a5fad --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py @@ -0,0 +1,67 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import numpy as np
+import paddle.distributed as dist
+import paddle.distributed.fleet as fleet
+from hybrid_parallel_pp_transformer import (
+    TestDistPPTraning,
+    set_random_seed,
+    ModelPipe,
+    batch_size,
+    length,
+    micro_batch_size,
+    vocab_size,
+)
+
+
+class TestDistPPTraningUnbalancedData(TestDistPPTraning):
+    def test_pp_model(self):
+        hcg = fleet.get_hybrid_communicate_group()
+        word_size = hcg.get_model_parallel_world_size()
+        dp_id = hcg.get_data_parallel_rank()
+        pp_id = hcg.get_stage_id()
+        rank_id = dist.get_rank()
+        topology = hcg.topology()
+        set_random_seed(1024, dp_id, rank_id)
+
+        model = ModelPipe(topology)
+        scheduler = paddle.optimizer.lr.PiecewiseDecay(
+            boundaries=[2], values=[0.001, 0.002], verbose=True
+        )
+        optimizer = paddle.optimizer.SGD(
+            learning_rate=scheduler, parameters=model.parameters()
+        )
+
+        model = fleet.distributed_model(model)
+        optimizer = fleet.distributed_optimizer(optimizer)
+
+        for step_id in range(5):
+            x = []
+            for _ in range(batch_size // micro_batch_size):
+                size = micro_batch_size
+                x_data = np.random.randint(0, vocab_size, size=[size, length])
+                x.append(paddle.to_tensor(x_data))
+            e_loss = model.eval_batch([x, x], True)
+            loss = model.train_batch([x, x], optimizer, scheduler)
+
+            # TODO(shenliang03) add utest for loss
+            if pp_id != 0:
+                np.testing.assert_allclose(loss.numpy(), e_loss.numpy())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pipeline_parallel.py
index f45104de32c8a..275c3721d66e7 100644
--- a/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pipeline_parallel.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pipeline_parallel.py
@@ -64,6 +64,13 @@ def test_hybrid_parallel_pp_clip_grad(self):
         self.run_mnist_2gpu('hybrid_parallel_pp_clip_grad.py')
         self.run_mnist_2gpu('hybrid_parallel_pp_clip_grad.py', eager_mode=False)
 
+    def test_hybrid_parallel_transformer_unbalanced_data(self):
+        self.run_mnist_2gpu('hybrid_parallel_pp_transformer_unbalanced_data.py')
+        self.run_mnist_2gpu(
+            'hybrid_parallel_pp_transformer_unbalanced_data.py',
+            eager_mode=False,
+        )
+
 
 if __name__ == "__main__":
     os.environ["FLAGS_enable_eager_mode"] = "1"

From ad39043fa1b4c5ca907565de0fe3d1827388910b Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Wed, 2 Nov 2022 10:52:13 +0800
Subject: [PATCH 79/91] Improve the tool for checking nan and inf, and support
 computing the max, min and mean of output tensors. (#47095)

* Improve the tool for checking nan and inf, and support computing the max,
  min and mean of output tensors.
* Add a FLAGS to control whether to abort when meeting inf/nan, and polish
  the code.
* Fix unittest.
* Change the computing of mean.
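* Example usage (a minimal sketch for illustration; `train.py` is a
  placeholder script name, and setting these flags via `paddle.set_flags`
  is assumed to work for the flags added here — the environment-variable
  form is the usual way to enable exported FLAGS):

    # Abort on the first NaN/Inf found in any op output (the default):
    #   FLAGS_check_nan_inf=1 python train.py
    # Log max/min/mean of op outputs and keep running instead of aborting:
    #   FLAGS_check_nan_inf=1 FLAGS_abort_on_nan_inf=0 \
    #       FLAGS_check_tensor_max_min=1 python train.py
    import paddle

    # Equivalent in-process setup, assuming the flags are exposed to Python:
    paddle.set_flags({'FLAGS_check_nan_inf': True,
                      'FLAGS_abort_on_nan_inf': False,
                      'FLAGS_check_tensor_max_min': True})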
--- .../framework/details/nan_inf_utils_detail.cu | 225 +++++++++++++++++- paddle/fluid/platform/flags.cc | 28 +++ .../fluid/tests/unittests/test_nan_inf.py | 2 +- 3 files changed, 247 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 57552a16cc5f4..163e5610030f6 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -12,15 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/details/nan_inf_utils_detail.h" +#include "paddle/fluid/framework/details/nan_inf_utils.h" + #include #include #include #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/details/nan_inf_utils.h" -#include "paddle/fluid/framework/details/nan_inf_utils_detail.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/kernels/funcs/math_cuda_utils.h" + +DECLARE_bool(abort_on_nan_inf); +DECLARE_bool(check_tensor_max_min); namespace paddle { namespace framework { @@ -133,6 +139,171 @@ __global__ void CheckNanInfKernel(const T* value, PrintNanInfKernel(value, numel, print_num, debug_info); } +template < + typename T, + std::enable_if_t>::value || + std::is_same>::value, + bool> = true> +__device__ void BlockReduceMaxMinAndWrite(const T max_value, + const T min_value, + const T mean_value, + int64_t offset, + T* max_ptr, + T* min_ptr, + T* mean_ptr) { + // TODO(Xreki): support complex +} + +template < + typename T, + std::enable_if_t>::value && + !std::is_same>::value, + bool> = true> +__device__ void BlockReduceMaxMinAndWrite(const T max_value, + const T min_value, + const T mean_value, + int64_t offset, + T* max_ptr, + T* min_ptr, + T* mean_ptr) { + if (max_ptr && min_ptr && mean_ptr) { + __syncthreads(); + + T block_max_value = phi::funcs::blockReduceMax(max_value, FINAL_MASK); + T block_min_value = phi::funcs::blockReduceMin(min_value, FINAL_MASK); + T block_mean_value = phi::funcs::blockReduceSum(mean_value, FINAL_MASK); + + if (threadIdx.x == 0) { + max_ptr[offset] = block_max_value; + min_ptr[offset] = block_min_value; + mean_ptr[offset] = block_mean_value; + } + } +} + +template +__global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, + const int64_t numel, + int* found_nan_inf_ptr, + MT* tensor_block_max_ptr, + MT* tensor_block_min_ptr, + MT* tensor_block_mean_ptr) { + bool has_nan = false; + bool has_inf = false; + + int64_t i = threadIdx.x + blockIdx.x * blockDim.x; + + MT max_value = static_cast(i < numel ? value_ptr[i] : value_ptr[0]); + MT min_value = static_cast(i < numel ? value_ptr[i] : value_ptr[0]); + MT mean_value = static_cast(0); + for (; i < numel; i += blockDim.x * gridDim.x) { + MT value = static_cast(value_ptr[i]); + + max_value = value > max_value ? value : max_value; + min_value = value < min_value ? 
value : min_value; + mean_value += value / static_cast(numel); + + if (isnan(value)) { + has_nan = true; + } + if (isinf(value)) { + has_inf = true; + } + + if (has_nan || has_inf) { + if (!tensor_block_max_ptr && !tensor_block_min_ptr && + !tensor_block_mean_ptr) { + break; + } + } + } + if (has_nan) { + found_nan_inf_ptr[0] = 1; + } + if (has_inf) { + found_nan_inf_ptr[1] = 1; + } + + BlockReduceMaxMinAndWrite(max_value, + min_value, + mean_value, + blockIdx.x, + tensor_block_max_ptr, + tensor_block_min_ptr, + tensor_block_mean_ptr); +} + +template +__global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, + const T* tensor_block_max_ptr, + const T* tensor_block_min_ptr, + const T* tensor_block_mean_ptr, + const char* debug_info, + int64_t numel, + int64_t numel_max_min, + bool abort_on_nan_inf, + bool check_tensor_max_min) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + int has_nan = found_nan_inf_ptr[0]; + int has_inf = found_nan_inf_ptr[1]; + + T max_value = static_cast(0); + T min_value = static_cast(0); + T mean_value = static_cast(0); + if (tensor_block_max_ptr && tensor_block_min_ptr && tensor_block_mean_ptr) { + max_value = tensor_block_max_ptr[0]; + min_value = tensor_block_min_ptr[0]; + mean_value = tensor_block_mean_ptr[0]; + + // numel_max_min <= 128 + for (int64_t i = 1; i < numel_max_min; ++i) { + T tmp_max_value = tensor_block_max_ptr[i]; + T tmp_min_value = tensor_block_min_ptr[i]; + T tmp_mean_value = tensor_block_mean_ptr[i]; + + max_value = tmp_max_value > max_value ? tmp_max_value : max_value; + min_value = tmp_min_value < min_value ? tmp_min_value : min_value; + mean_value += tmp_mean_value; + } + } + + if (has_nan || has_inf) { + if (abort_on_nan_inf) { + PADDLE_ENFORCE(false, + "===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, " + "find_inf=%d, " + "max=%e, min=%e, mean=%e===\n", + debug_info, + numel, + has_nan, + has_inf, + static_cast(max_value), + static_cast(min_value), + static_cast(mean_value)); + } else { + printf( + "===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, " + "find_inf=%d, " + "max=%e, min=%e, mean=%e===\n", + debug_info, + numel, + has_nan, + has_inf, + static_cast(max_value), + static_cast(min_value), + static_cast(mean_value)); + } + } else if (check_tensor_max_min) { + printf("[PRECISION] in %s, numel=%ld, max=%e, min=%e, mean=%e\n", + debug_info, + numel, + static_cast(max_value), + static_cast(min_value), + static_cast(mean_value)); + } + } +} + template <> template void TensorCheckerVisitor::apply( @@ -141,8 +312,6 @@ void TensorCheckerVisitor::apply( std::is_same>::value || std::is_same>::value>::type*) const { - int print_num = 3; - auto* dev_ctx = reinterpret_cast( platform::DeviceContextPool::Instance().Get(tensor_.place())); int dev_id = tensor_.place().device; @@ -152,7 +321,12 @@ void TensorCheckerVisitor::apply( platform::errors::OutOfRange("GPU dev_id must >=0 and < dev_count=%d", multi_op_var2gpu_str_mutex().size())); - std::string op_var = "[op=" + op_type_ + "] [tensor=" + var_name_ + "]"; + std::string dtype_str = DataTypeToString(DataTypeTrait::DataType()); + if (dtype_str == "::paddle::platform::float16") { + dtype_str = "float16"; + } + std::string op_var = "[op=" + op_type_ + "] [tensor=" + var_name_ + + "] [dtype=" + dtype_str + "]"; char* gpu_str_ptr = NULL; { @@ -212,6 +386,8 @@ void TensorCheckerVisitor::apply( std::min(static_cast(128), static_cast((tensor_.numel() + threads - 1) / threads)); #ifdef __HIPCC__ + int print_num = 3; + hipLaunchKernelGGL(CheckNanInfKernel, dim3(blocks), 
dim3(threads), @@ -222,8 +398,43 @@ void TensorCheckerVisitor::apply( print_num, gpu_str_ptr); #else - CheckNanInfKernel<<stream()>>>( - tensor_.data(), tensor_.numel(), print_num, gpu_str_ptr); + using MT = typename phi::dtype::MPTypeTrait::Type; + + phi::DenseTensor found_nan_inf; + found_nan_inf.Resize({2}); + int* found_nan_inf_ptr = found_nan_inf.mutable_data(tensor_.place()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( + found_nan_inf_ptr, 0, 2 * sizeof(int), dev_ctx->stream())); + + int64_t numel_max_min = blocks; + + phi::DenseTensor tensor_block_max_min; + tensor_block_max_min.Resize({static_cast(3 * numel_max_min)}); + MT* tensor_block_max_ptr = + tensor_block_max_min.mutable_data(tensor_.place()); + MT* tensor_block_min_ptr = tensor_block_max_ptr + numel_max_min; + MT* tensor_block_mean_ptr = tensor_block_max_ptr + 2 * numel_max_min; + + FindNanInfAndBlockMaxMin + <<stream()>>>(tensor_.data(), + tensor_.numel(), + found_nan_inf_ptr, + tensor_block_max_ptr, + tensor_block_min_ptr, + tensor_block_mean_ptr); + + bool abort_on_nan_inf = FLAGS_abort_on_nan_inf; + bool check_tensor_max_min = FLAGS_check_tensor_max_min; + FindGlobalMaxMinAndPrint + <<<1, 1, 0, dev_ctx->stream()>>>(found_nan_inf_ptr, + tensor_block_max_ptr, + tensor_block_min_ptr, + tensor_block_mean_ptr, + gpu_str_ptr, + tensor_.numel(), + numel_max_min, + abort_on_nan_inf, + check_tensor_max_min); #endif } diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 23ecfecbbd2d9..1649c0c0c1404 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -68,6 +68,34 @@ PADDLE_DEFINE_EXPORTED_bool( "Checking whether operator produce NAN/INF or not. It will be " "extremely slow so please use this flag wisely."); +/** + * Operator related FLAG + * Name: FLAGS_abort_on_nan_inf + * Since Version: 2.5.0 + * Value Range: bool, default=true + * Example: + * Note: Used to debug. Whether abort the process when any operator produce + * NAN/INF. It only works when FLAGS_check_nan_inf is set. + */ +PADDLE_DEFINE_EXPORTED_bool( + abort_on_nan_inf, + true, + "Whether abort the process when any operator produce NAN/INF or not."); + +/** + * Operator related FLAG + * Name: FLAGS_check_tensor_max_min + * Since Version: 2.5.0 + * Value Range: bool, default=false + * Example: + * Note: Used to debug. Enable to calculate and print the max and min value of + * each operator's output tensor. It only works when FLAGS_check_nan_inf is set. 
+ */
+PADDLE_DEFINE_EXPORTED_bool(
+    check_tensor_max_min,
+    false,
+    "Whether to check all the output tensors' min and max values.");
+
 /**
  * Operator related FLAG
  * Name: FLAGS_check_nan_inf
diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py
index f062adb6a7a43..39c01f053f536 100644
--- a/python/paddle/fluid/tests/unittests/test_nan_inf.py
+++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py
@@ -47,7 +47,7 @@ def check_nan_inf(self):
         # in python3, type(out+err) is 'bytes', need use encode
         if paddle.fluid.core.is_compiled_with_cuda():
-            assert (out + err).find('find nan or inf==='.encode()) != -1
+            assert (out + err).find('find_nan=1, find_inf=1'.encode()) != -1
         else:
             assert (out + err).find(
                 'There are `nan` or `inf` in tensor'.encode()

From 4325da3980f12e77f47dae37218a4577ddc7f80c Mon Sep 17 00:00:00 2001
From: risemeup1 <62429225+risemeup1@users.noreply.github.com>
Date: Wed, 2 Nov 2022 13:44:44 +0800
Subject: [PATCH 80/91] Modify test file (#47544)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Modify the .gitignore file to ignore the ljd_sh files
* Fix the issue where modifying a unit-test file did not trigger the precise tests
* Revert the .gitignore change
* Fix the issue where modifying a unit test did not trigger the precise tests
* Rename variables so their meaning is easier to understand, test=coverage
---
 tools/get_pr_ut.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py
index fcfa68bb4da48..59f1108188499 100644
--- a/tools/get_pr_ut.py
+++ b/tools/get_pr_ut.py
@@ -278,18 +278,20 @@ def get_all_count(self):
                 all_counts = line.split()[-1]
         return int(all_counts)
 
-    def file_is_unnit_test(self, filename):
+    def file_is_unnit_test(self, unittest_path):
         # get all testcases by ctest-N
         all_ut_file = '%s/build/all_ut_file' % PADDLE_ROOT
         os.system(
             "cd %s/build && ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d' > %s"
             % (PADDLE_ROOT, all_ut_file)
         )
+        (unittest_directory, unittest_name) = os.path.split(unittest_path)
         # determine whether filename is in all_ut_case
         with open(all_ut_file, 'r') as f:
-            (filepath, tempfilename) = os.path.split(filename)
-            for f_file in f:
-                if f_file.strip('\n') == tempfilename.split(".")[0]:
+            all_unittests = f.readlines()
+            for test in all_unittests:
+                test = test.replace('\n', '').strip()
+                if test == unittest_name.split(".")[0]:
                     return True
         else:
             return False

From 20db5221ece0329e62eb6e5a1c9664b0af6439ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kevin=E5=90=B4=E5=98=89=E6=96=87?= <417333277@qq.com>
Date: Wed, 2 Nov 2022 13:46:29 +0800
Subject: [PATCH 81/91] Remove redundant numpy import (#47483)

---
 python/paddle/incubate/nn/functional/fused_transformer.py | 1 -
 python/paddle/nn/layer/pooling.py                         | 2 --
 python/paddle/optimizer/adagrad.py                        | 1 -
 python/paddle/regularizer.py                              | 1 -
 4 files changed, 5 deletions(-)

diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py
index dffddb8b9eca2..0887cd56aefe4 100644
--- a/python/paddle/incubate/nn/functional/fused_transformer.py
+++ b/python/paddle/incubate/nn/functional/fused_transformer.py
@@ -947,7 +947,6 @@ def fused_multi_transformer(
             # required: gpu
             import paddle
             import paddle.incubate.nn.functional as F
-            import numpy as np
 
             # input: [batch_size, seq_len, embed_dim]
             x = paddle.rand(shape=(2, 4, 128), dtype="float32")
diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
index a9b5af5199faf..3c3abe5e3903f 100755
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -1171,7 +1171,6 @@ class 
MaxUnPool1D(Layer): import paddle import paddle.nn.functional as F - import numpy as np data = paddle.rand(shape=[1, 3, 16]) pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) @@ -1351,7 +1350,6 @@ class MaxUnPool3D(Layer): import paddle import paddle.nn.functional as F - import numpy as np data = paddle.rand(shape=[1, 1, 4, 4, 6]) pool_out, indices = F.max_pool3d(data, kernel_size=2, stride=2, padding=0, return_mask=True) diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py index a4d9416e93bcc..522ca753a9976 100644 --- a/python/paddle/optimizer/adagrad.py +++ b/python/paddle/optimizer/adagrad.py @@ -70,7 +70,6 @@ class Adagrad(Optimizer): .. code-block:: python import paddle - import numpy as np inp = paddle.rand(shape=[10, 10]) linear = paddle.nn.Linear(10, 10) diff --git a/python/paddle/regularizer.py b/python/paddle/regularizer.py index 395ec08a36848..38060b8233fdb 100644 --- a/python/paddle/regularizer.py +++ b/python/paddle/regularizer.py @@ -105,7 +105,6 @@ class L2Decay(fluid.regularizer.L2Decay): # Example1: set Regularizer in optimizer import paddle from paddle.regularizer import L2Decay - import numpy as np linear = paddle.nn.Linear(10, 10) inp = paddle.rand(shape=[10, 10], dtype="float32") out = linear(inp) From 623dce83f072b6ac86f1c95cb38f574ca8df0703 Mon Sep 17 00:00:00 2001 From: Leo Chen <39020268+leo0519@users.noreply.github.com> Date: Wed, 2 Nov 2022 13:56:42 +0800 Subject: [PATCH 82/91] Fix TRT UT failures (#47488) --- .../inference/test_trt_convert_elementwise.py | 6 ++-- .../inference/test_trt_convert_group_norm.py | 4 +-- .../ir/inference/test_trt_convert_pool2d.py | 34 +++++++++++++++++++ 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index 3699055475507..8420c9cdaae46 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -382,7 +382,7 @@ def sample_program_configs(self): def generate_input(shape): return np.random.random(shape).astype(np.float32) - for shape in [[4], [4, 32], [2, 64, 32], [1, 8, 16, 32]]: + for shape in [[4], [4, 32], [2, 32, 16], [1, 8, 16, 32]]: for op_type in [ "elementwise_add", "elementwise_mul", @@ -464,8 +464,8 @@ def generate_dynamic_shape(attrs): "input_data2": [128, 128, 256], } self.dynamic_shape.opt_input_shape = { - "input_data1": [2, 64, 64], - "input_data2": [2, 64, 64], + "input_data1": [2, 32, 16], + "input_data2": [2, 32, 16], } elif self.dims == 4: self.dynamic_shape.min_input_shape = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py index 18f40934172f6..cc4e719585c05 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py @@ -129,7 +129,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( attrs, False - ), 1e-3 + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) @@ -140,7 +140,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): self.trt_param.precision = 
paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( attrs, True - ), 1e-3 + ), (1e-3, 1e-3) def add_skip_trt_case(self): pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py index 7bdaab0ee841c..cf57fd3bebe42 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py @@ -20,6 +20,7 @@ from typing import Any, Dict, List import unittest import itertools +import copy class TrtConvertPool2dTest(TrtLayerAutoScanTest): @@ -188,6 +189,39 @@ def teller(program_config, predictor_config): "The results of some cases are Nan, but the results of TensorRT and GPU are the same.", ) + def assert_tensors_near( + self, + atol: float, + rtol: float, + tensor: Dict[str, np.array], + baseline: Dict[str, np.array], + ): + for key, arr in tensor.items(): + self.assertEqual( + baseline[key].shape, + arr.shape, + 'The output shapes are not equal, the baseline shape is ' + + str(baseline[key].shape) + + ', but got ' + + str(arr.shape), + ) + + # The result of Pool2d may have some elements that is the least value (-65504 for FP16), + # but for FP32 and FP16 precision, their least value are different. + # We set a threshold that is the least value of FP16, + # and make the values less than the threshold to be the threshold. + def align_less_threshold(arr, threshold): + return np.clip(arr, threshold, None) + + fp16_min = np.finfo(np.float16).min + baseline_threshold = align_less_threshold( + copy.deepcopy(baseline[key]), fp16_min + ) + arr_threshold = align_less_threshold(copy.deepcopy(arr), fp16_min) + np.testing.assert_allclose( + baseline_threshold, arr_threshold, rtol=rtol, atol=atol + ) + def test(self): self.add_skip_trt_case() self.run_test() From cad2e68de2c8f3e405d753884bb1c47d74983e3b Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Wed, 2 Nov 2022 13:57:29 +0800 Subject: [PATCH 83/91] [Zero-Dim] support input 0D Tensor for some binary api (#46909) --- .../operators/common_infer_shape_functions.cc | 2 +- .../operators/elementwise/elementwise_npu.h | 2 +- paddle/phi/kernels/funcs/common_shape.h | 2 +- paddle/phi/kernels/funcs/elementwise_base.h | 4 +- .../phi/kernels/funcs/elementwise_grad_base.h | 4 +- paddle/phi/kernels/xpu/elementwise.h | 4 +- .../fluid/tests/unittests/test_bitwise_op.py | 60 +++++++- .../fluid/tests/unittests/test_compare_op.py | 48 ++++++ .../unittests/test_elementwise_add_op.py | 21 +++ .../unittests/test_elementwise_div_op.py | 36 +++++ .../unittests/test_elementwise_floordiv_op.py | 21 +++ .../unittests/test_elementwise_max_op.py | 30 ++++ .../unittests/test_elementwise_min_op.py | 30 ++++ .../unittests/test_elementwise_mod_op.py | 21 +++ .../unittests/test_elementwise_mul_op.py | 21 +++ .../unittests/test_elementwise_pow_op.py | 33 +++++ .../unittests/test_elementwise_sub_op.py | 30 ++++ .../fluid/tests/unittests/test_logical_op.py | 5 +- .../tests/unittests/test_zero_dim_shape.py | 140 ++++++++++++++++++ 19 files changed, 503 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc index 446a24a08b8cc..9dce94d16b4db 100644 --- a/paddle/fluid/operators/common_infer_shape_functions.cc +++ b/paddle/fluid/operators/common_infer_shape_functions.cc @@ -40,7 +40,7 @@ inline void GetBroadcastDimsArrays(const 
framework::DDim &x_dims, platform::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); - PADDLE_ENFORCE_LT(axis, + PADDLE_ENFORCE_LE(axis, max_dim, platform::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", diff --git a/paddle/fluid/operators/elementwise/elementwise_npu.h b/paddle/fluid/operators/elementwise/elementwise_npu.h index 5266491d6f506..45e5a548f91ed 100644 --- a/paddle/fluid/operators/elementwise/elementwise_npu.h +++ b/paddle/fluid/operators/elementwise/elementwise_npu.h @@ -123,7 +123,7 @@ void NpuElementWiseOpBroadcast(const platform::NPUDeviceContext& dev_ctx, platform::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); - PADDLE_ENFORCE_LT(axis, + PADDLE_ENFORCE_LE(axis, max_dim, platform::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index 2daf8ab0bd97e..01b06120965fc 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -45,7 +45,7 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); - PADDLE_ENFORCE_LT(axis, + PADDLE_ENFORCE_LE(axis, max_dim, phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 100d2dcd612ce..29da617413853 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -326,7 +326,7 @@ void CommonElementwiseBroadcastForward(const CPUContext &dev_ctx, phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); - PADDLE_ENFORCE_LT(axis, + PADDLE_ENFORCE_LE(axis, max_dim, phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", @@ -394,7 +394,7 @@ void ElementwiseCompute(const CPUContext &dev_ctx, errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); - PADDLE_ENFORCE_LT(axis, + PADDLE_ENFORCE_LE(axis, max_dim, errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index 62889b530af99..e52c669c48d98 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -287,7 +287,7 @@ void ElemwiseGradComputeWithBroadcast(const CPUContext &ctx, errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); - PADDLE_ENFORCE_LT(axis, + PADDLE_ENFORCE_LE(axis, max_dim, errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", @@ -1725,7 +1725,7 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); - PADDLE_ENFORCE_LT(axis, + PADDLE_ENFORCE_LE(axis, max_dim, errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", diff --git a/paddle/phi/kernels/xpu/elementwise.h b/paddle/phi/kernels/xpu/elementwise.h index 7c0c2a3497b00..dfaaae59bb3ce 100644 --- a/paddle/phi/kernels/xpu/elementwise.h +++ b/paddle/phi/kernels/xpu/elementwise.h @@ -51,7 +51,7 @@ void 
XPUElementwise(const XPUContext& dev_ctx, errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); - PADDLE_ENFORCE_LT(axis, + PADDLE_ENFORCE_LE(axis, max_dim, errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", @@ -121,7 +121,7 @@ void XPUElementwiseGrad(const XPUContext& dev_ctx, errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); - PADDLE_ENFORCE_LT(axis, + PADDLE_ENFORCE_LE(axis, max_dim, errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", diff --git a/python/paddle/fluid/tests/unittests/test_bitwise_op.py b/python/paddle/fluid/tests/unittests/test_bitwise_op.py index 6a7b039380b9e..bcd94392446c2 100644 --- a/python/paddle/fluid/tests/unittests/test_bitwise_op.py +++ b/python/paddle/fluid/tests/unittests/test_bitwise_op.py @@ -57,6 +57,24 @@ def init_bound(self): self.high = 100 +class TestBitwiseAnd_ZeroDim1(TestBitwiseAnd): + def init_shape(self): + self.x_shape = [] + self.y_shape = [] + + +class TestBitwiseAnd_ZeroDim2(TestBitwiseAnd): + def init_shape(self): + self.x_shape = [2, 3, 4, 5] + self.y_shape = [] + + +class TestBitwiseAnd_ZeroDim3(TestBitwiseAnd): + def init_shape(self): + self.x_shape = [] + self.y_shape = [2, 3, 4, 5] + + class TestBitwiseAndUInt8(TestBitwiseAnd): def init_dtype(self): self.dtype = np.uint8 @@ -143,6 +161,24 @@ def init_bound(self): self.high = 100 +class TestBitwiseOr_ZeroDim1(TestBitwiseOr): + def init_shape(self): + self.x_shape = [] + self.y_shape = [] + + +class TestBitwiseOr_ZeroDim2(TestBitwiseOr): + def init_shape(self): + self.x_shape = [2, 3, 4, 5] + self.y_shape = [] + + +class TestBitwiseOr_ZeroDim3(TestBitwiseOr): + def init_shape(self): + self.x_shape = [] + self.y_shape = [2, 3, 4, 5] + + class TestBitwiseOrUInt8(TestBitwiseOr): def init_dtype(self): self.dtype = np.uint8 @@ -229,6 +265,24 @@ def init_bound(self): self.high = 100 +class TestBitwiseXor_ZeroDim1(TestBitwiseXor): + def init_shape(self): + self.x_shape = [] + self.y_shape = [] + + +class TestBitwiseXor_ZeroDim2(TestBitwiseXor): + def init_shape(self): + self.x_shape = [2, 3, 4, 5] + self.y_shape = [] + + +class TestBitwiseXor_ZeroDim3(TestBitwiseXor): + def init_shape(self): + self.x_shape = [] + self.y_shape = [2, 3, 4, 5] + + class TestBitwiseXorUInt8(TestBitwiseXor): def init_dtype(self): self.dtype = np.uint8 @@ -311,6 +365,11 @@ def init_bound(self): self.high = 100 +class TestBitwiseNot_ZeroDim(TestBitwiseNot): + def init_shape(self): + self.x_shape = [] + + class TestBitwiseNotUInt8(TestBitwiseNot): def init_dtype(self): self.dtype = np.uint8 @@ -334,7 +393,6 @@ def init_dtype(self): def init_shape(self): self.x_shape = [2, 3, 4, 5] - self.y_shape = [4, 1] class TestBitwiseNotInt64(TestBitwiseNot): diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index d9636972a13e3..c5b69f8c59af6 100755 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -283,6 +283,54 @@ def test_dynamic_api_bool(self): self.assertEqual((out.numpy() == self.real_result).all(), True) paddle.enable_static() + def test_zero_dim_api_1(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.randint(-3, 3, shape=[], dtype='int32') + y = paddle.randint(-3, 3, shape=[], dtype='int32') + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = 
paddle.static.Executor(self.place) + ( + x_np, + y_np, + res, + ) = exe.run(fetch_list=[x, y, out]) + real_result = callback(x_np, y_np) + self.assertEqual((res == real_result).all(), True) + + def test_zero_dim_api_2(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.randint(-3, 3, shape=[2, 3, 4], dtype='int32') + y = paddle.randint(-3, 3, shape=[], dtype='int32') + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + ( + x_np, + y_np, + res, + ) = exe.run(fetch_list=[x, y, out]) + real_result = callback(x_np, y_np) + self.assertEqual((res == real_result).all(), True) + + def test_zero_dim_api_3(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.randint(-3, 3, shape=[], dtype='int32') + y = paddle.randint(-3, 3, shape=[2, 3, 4], dtype='int32') + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + ( + x_np, + y_np, + res, + ) = exe.run(fetch_list=[x, y, out]) + real_result = callback(x_np, y_np) + self.assertEqual((res == real_result).all(), True) + def test_broadcast_api_1(self): paddle.enable_static() with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index 6bfd14dc84152..d9057ee4ca6ab 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -102,6 +102,27 @@ def init_axis(self): self.axis = -1 +class TestElementwiseAddOp_ZeroDim1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, []).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, []).astype(self.dtype) + self.out = np.add(self.x, self.y) + + +class TestElementwiseAddOp_ZeroDim2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, []).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + +class TestElementwiseAddOp_ZeroDim3(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, []).astype(self.dtype) + self.out = np.add(self.x, self.y) + + @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index 16bf0df5af38b..7a0c5d09fbffc 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -112,6 +112,42 @@ def test_check_gradient(self): self.check_grad_with_place(*check_args, **check_kwargs) +class TestElementwiseDivOp_ZeroDim1(ElementwiseDivOp): + def init_shape(self): + self.x_shape = [] + self.y_shape = [] + + +class TestElementwiseDivOp_ZeroDim2(ElementwiseDivOp): + def init_shape(self): + self.x_shape = [13, 17] + self.y_shape = [] + + def compute_output(self, x, y): + return x / y.reshape([1, 1]) + + def compute_gradient_x(self, grad_out, y): + return grad_out / y.reshape([1, 1]) + + def compute_gradient_y(self, grad_out, out, y): + return np.sum(-1 * grad_out * out / y.reshape([1, 1])) + + +class TestElementwiseDivOp_ZeroDim3(ElementwiseDivOp): + def init_shape(self): + self.x_shape = [] + self.y_shape = [13, 17] + + def 
compute_output(self, x, y): + return x.reshape([1, 1]) / y + + def compute_gradient_x(self, grad_out, y): + return np.sum(grad_out / y) + + def compute_gradient_y(self, grad_out, out, y): + return -1 * grad_out * out / y + + @unittest.skipIf( not core.is_compiled_with_cuda() or not core.is_bfloat16_supported(core.CUDAPlace(0)), diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py index de058ed2b3b09..022d5929f1bab 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py @@ -57,6 +57,27 @@ def init_axis(self): pass +class TestElementwiseFloorDivOp_ZeroDim1(TestElementwiseModOp): + def init_input_output(self): + self.x = np.random.uniform(0, 10000, []).astype(self.dtype) + self.y = np.random.uniform(0, 1000, []).astype(self.dtype) + self.out = np.floor_divide(self.x, self.y) + + +class TestElementwiseFloorDivOp_ZeroDim2(TestElementwiseModOp): + def init_input_output(self): + self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype) + self.y = np.random.uniform(0, 1000, []).astype(self.dtype) + self.out = np.floor_divide(self.x, self.y) + + +class TestElementwiseFloorDivOp_ZeroDim3(TestElementwiseModOp): + def init_input_output(self): + self.x = np.random.uniform(0, 10000, []).astype(self.dtype) + self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype) + self.out = np.floor_divide(self.x, self.y) + + class TestElementwiseModOp_scalar(TestElementwiseModOp): def init_input_output(self): scale_x = random.randint(0, 100000000) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py index 018a44c2be964..671b5a942b8a8 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py @@ -55,6 +55,36 @@ def test_check_grad_ingore_y(self): ) +class TestElementwiseMaxOp_ZeroDim1(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_max" + self.python_api = paddle.maximum + x = np.random.uniform(0.1, 1, []).astype("float64") + y = np.random.uniform(0.1, 1, []).astype("float64") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + +class TestElementwiseMaxOp_ZeroDim2(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_max" + self.python_api = paddle.maximum + x = np.random.uniform(0.1, 1, [13, 17]).astype("float64") + y = np.random.uniform(0.1, 1, []).astype("float64") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + +class TestElementwiseMaxOp_ZeroDim3(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_max" + self.python_api = paddle.maximum + x = np.random.uniform(0.1, 1, []).astype("float64") + y = np.random.uniform(0.1, 1, [13, 17]).astype("float64") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + @unittest.skipIf( core.is_compiled_with_cuda() and ( diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py index 5a2cdc691faeb..1fe78b79fb059 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py @@ -58,6 +58,36 @@ def 
test_check_grad_ingore_y(self): ) +class TestElementwiseMinOp_ZeroDim1(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_min" + self.python_api = paddle.minimum + x = np.random.uniform(0.1, 1, []).astype("float64") + y = np.random.uniform(0.1, 1, []).astype("float64") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])} + + +class TestElementwiseMinOp_ZeroDim2(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_min" + self.python_api = paddle.minimum + x = np.random.uniform(0.1, 1, [13, 17]).astype("float64") + y = np.random.uniform(0.1, 1, []).astype("float64") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])} + + +class TestElementwiseMinOp_ZeroDim3(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_min" + self.python_api = paddle.minimum + x = np.random.uniform(0.1, 1, []).astype("float64") + y = np.random.uniform(0.1, 1, [13, 17]).astype("float64") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])} + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast." ) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py index 8969d76ce5165..9c9d2d91209f3 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py @@ -59,6 +59,27 @@ def init_axis(self): pass +class TestElementwiseModOp_ZeroDim1(TestElementwiseModOp): + def init_input_output(self): + self.x = np.random.uniform(0, 10000, []).astype(self.dtype) + self.y = np.random.uniform(0, 1000, []).astype(self.dtype) + self.out = np.mod(self.x, self.y) + + +class TestElementwiseModOp_ZeroDim2(TestElementwiseModOp): + def init_input_output(self): + self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype) + self.y = np.random.uniform(0, 1000, []).astype(self.dtype) + self.out = np.mod(self.x, self.y) + + +class TestElementwiseModOp_ZeroDim3(TestElementwiseModOp): + def init_input_output(self): + self.x = np.random.uniform(0, 10000, []).astype(self.dtype) + self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype) + self.out = np.mod(self.x, self.y) + + class TestElementwiseModOp_scalar(TestElementwiseModOp): def init_input_output(self): scale_x = random.randint(0, 100000000) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 987a17ff1f5ea..263fb8a998182 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -85,6 +85,27 @@ def init_axis(self): pass +class TestElementwiseMulOp_ZeroDim1(ElementwiseMulOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, []).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, []).astype(self.dtype) + self.out = np.multiply(self.x, self.y) + + +class TestElementwiseMulOp_ZeroDim2(ElementwiseMulOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, []).astype(self.dtype) + self.out = np.multiply(self.x, self.y) + + +class TestElementwiseMulOp_ZeroDim3(ElementwiseMulOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, []).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 
17]).astype(self.dtype) + self.out = np.multiply(self.x, self.y) + + class TestBF16ElementwiseMulOp(OpTest): def setUp(self): self.op_type = "elementwise_mul" diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py index 53cb18f8aa33b..1d53dbdb2fa6a 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py @@ -48,6 +48,39 @@ def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out', check_eager=True) +class TestElementwisePowOp_ZeroDim1(TestElementwisePowOp): + def setUp(self): + self.op_type = "elementwise_pow" + self.python_api = paddle.pow + self.inputs = { + 'X': np.random.uniform(1, 2, []).astype("float64"), + 'Y': np.random.uniform(1, 2, []).astype("float64"), + } + self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])} + + +class TestElementwisePowOp_ZeroDim2(TestElementwisePowOp): + def setUp(self): + self.op_type = "elementwise_pow" + self.python_api = paddle.pow + self.inputs = { + 'X': np.random.uniform(1, 2, [20, 5]).astype("float64"), + 'Y': np.random.uniform(1, 2, []).astype("float64"), + } + self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])} + + +class TestElementwisePowOp_ZeroDim3(TestElementwisePowOp): + def setUp(self): + self.op_type = "elementwise_pow" + self.python_api = paddle.pow + self.inputs = { + 'X': np.random.uniform(1, 2, []).astype("float64"), + 'Y': np.random.uniform(1, 2, [20, 5]).astype("float64"), + } + self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])} + + class TestElementwisePowOp_big_shape_1(TestElementwisePowOp): def setUp(self): self.op_type = "elementwise_pow" diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index f8f050d6f6b08..d89b3b22aa3bb 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -46,6 +46,36 @@ def test_check_grad_ingore_y(self): ) +class TestElementwiseSubOp_ZeroDim1(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.uniform(0.1, 1, []).astype("float64"), + 'Y': np.random.uniform(0.1, 1, []).astype("float64"), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_ZeroDim2(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), + 'Y': np.random.uniform(0.1, 1, []).astype("float64"), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_ZeroDim3(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.uniform(0.1, 1, []).astype("float64"), + 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + class TestBF16ElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" diff --git a/python/paddle/fluid/tests/unittests/test_logical_op.py b/python/paddle/fluid/tests/unittests/test_logical_op.py index 31490961c84ce..c05d99a4d9416 100755 --- a/python/paddle/fluid/tests/unittests/test_logical_op.py +++ b/python/paddle/fluid/tests/unittests/test_logical_op.py @@ -50,6 +50,9 @@ 'Axis1InLargerDim': {'x_shape': [1, 4, 5], 'y_shape': [2, 3, 1, 5]}, 
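Note: the ZeroDim entries added just below extend this shape table to 0-D operands; after this patch a 0-D tensor behaves like a scalar under the broadcast check on either side of a binary op. A minimal sketch of the behavior these cases pin down (the op and values here are illustrative, not taken from the test data):

    import paddle

    x = paddle.full([], True, dtype='bool')        # 0-D tensor, acts like a scalar
    y = paddle.full([2, 3, 4], False, dtype='bool')
    out = paddle.logical_or(x, y)                  # 0-D side broadcasts to [2, 3, 4]
    assert list(out.shape) == [2, 3, 4]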
'EqualDim1': {'x_shape': [10, 7], 'y_shape': [10, 7]}, 'EqualDim2': {'x_shape': [1, 1, 4, 5], 'y_shape': [2, 3, 1, 5]}, + 'ZeroDim1': {'x_shape': [], 'y_shape': []}, + 'ZeroDim2': {'x_shape': [2, 3, 4, 5], 'y_shape': []}, + 'ZeroDim3': {'x_shape': [], 'y_shape': [2, 3, 4, 5]}, } TEST_META_WRONG_SHAPE_DATA = { @@ -116,7 +119,7 @@ def np_data_generator(np_shape, dtype, *args, **kwargs): if dtype == bool: return np.random.choice(a=[True, False], size=np_shape).astype(bool) else: - return np.random.randn(*np_shape).astype(dtype) + return np.random.normal(0, 1, np_shape).astype(dtype) def test(unit_test, use_gpu=False, test_error=False): diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_shape.py b/python/paddle/fluid/tests/unittests/test_zero_dim_shape.py index 0cab423aa7b98..90173712d422c 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_shape.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_shape.py @@ -210,5 +210,145 @@ def test_static(self): paddle.disable_static() +binary_api_list = [ + {'func': paddle.add, 'cls_method': '__add__'}, + {'func': paddle.subtract, 'cls_method': '__sub__'}, + {'func': paddle.multiply, 'cls_method': '__mul__'}, + {'func': paddle.divide, 'cls_method': '__div__'}, + {'func': paddle.subtract, 'cls_method': '__sub__'}, + paddle.pow, +] + +binary_api_list_without_grad = [ + {'func': paddle.add, 'cls_method': '__add__'}, + {'func': paddle.subtract, 'cls_method': '__sub__'}, + {'func': paddle.multiply, 'cls_method': '__mul__'}, + {'func': paddle.divide, 'cls_method': '__div__'}, + {'func': paddle.subtract, 'cls_method': '__sub__'}, + paddle.pow, + {'func': paddle.mod, 'cls_method': '__mod__'}, + paddle.floor_mod, + paddle.remainder, + {'func': paddle.equal, 'cls_method': '__eq__'}, + {'func': paddle.not_equal, 'cls_method': '__ne__'}, + {'func': paddle.greater_equal, 'cls_method': '__ge__'}, + {'func': paddle.greater_than, 'cls_method': '__gt__'}, + {'func': paddle.less_equal, 'cls_method': '__le__'}, + {'func': paddle.less_than, 'cls_method': '__lt__'}, + paddle.logical_and, + paddle.logical_or, + paddle.logical_xor, +] + + +class TestBinaryAPI(unittest.TestCase): + def test_dygraph_binary(self): + paddle.disable_static() + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + for api in binary_api_list + binary_api_list_without_grad: + # 1) x/y is 0D + x = paddle.rand([]) + y = paddle.rand([]) + x.stop_gradient = False + y.stop_gradient = False + if isinstance(api, dict): + out = api['func'](x, y) + out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) + np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) + else: + out = api(x, y) + + self.assertEqual(x.shape, []) + self.assertEqual(y.shape, []) + self.assertEqual(out.shape, []) + + if api not in binary_api_list_without_grad: + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(y.grad.shape, []) + self.assertEqual(out.grad.shape, []) + + # 2) x is not 0D , y is 0D + x = paddle.rand([2, 3, 4]) + y = paddle.rand([]) + x.stop_gradient = False + y.stop_gradient = False + if isinstance(api, dict): + out = api['func'](x, y) + out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) + np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) + else: + out = api(x, y) + + self.assertEqual(x.shape, [2, 3, 4]) + self.assertEqual(y.shape, []) + self.assertEqual(out.shape, [2, 3, 4]) + + if api not in binary_api_list_without_grad: + out.backward() + self.assertEqual(x.grad.shape, [2, 3, 4]) + self.assertEqual(y.grad.shape, []) + 
self.assertEqual(out.grad.shape, [2, 3, 4])
+
+        # 3) x is 0D , y is not 0D
+        x = paddle.rand([])
+        y = paddle.rand([2, 3, 4])
+        x.stop_gradient = False
+        y.stop_gradient = False
+        if isinstance(api, dict):
+            out = api['func'](x, y)
+            out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y)
+            np.testing.assert_array_equal(out_cls.numpy(), out.numpy())
+        else:
+            out = api(x, y)
+        out.backward()
+
+        self.assertEqual(x.shape, [])
+        self.assertEqual(y.shape, [2, 3, 4])
+        self.assertEqual(out.shape, [2, 3, 4])
+
+        if api not in binary_api_list_without_grad:
+            out.backward()
+            self.assertEqual(x.grad.shape, [])
+            self.assertEqual(y.grad.shape, [2, 3, 4])
+            self.assertEqual(out.grad.shape, [2, 3, 4])
+
+        paddle.enable_static()
+
+    def test_static_unary(self):
+        paddle.enable_static()
+        for api in binary_api_list:
+            main_prog = fluid.Program()
+            with fluid.program_guard(main_prog, fluid.Program()):
+                x = paddle.rand([])
+                y = paddle.rand([])
+                x.stop_gradient = False
+                y.stop_gradient = False
+                if isinstance(api, dict):
+                    out = api['func'](x, y)
+                else:
+                    out = api(x, y)
+                fluid.backward.append_backward(out)
+
+                # append_backward always set grad shape to [1]
+                prog = paddle.static.default_main_program()
+                block = prog.global_block()
+
+                # Test compile shape
+                self.assertEqual(x.shape, ())
+                self.assertEqual(y.shape, ())
+                self.assertEqual(out.shape, ())
+
+                exe = fluid.Executor()
+                result = exe.run(main_prog, fetch_list=[x, y, out])
+
+                # Test runtime shape
+                self.assertEqual(result[0].shape, ())
+                self.assertEqual(result[1].shape, ())
+                self.assertEqual(result[2].shape, ())
+
+        paddle.disable_static()
+
+
 if __name__ == "__main__":
     unittest.main()

From fe8c679653b769f9b1f36c28d430cb04b899da72 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Wed, 2 Nov 2022 14:22:39 +0800
Subject: [PATCH 84/91] [PHI]Standardise some C++ API (Part3) (#47532)

* Standardise batch norm

* standardize conv3d and depthwise_conv2d

* fix ci bugs
---
 paddle/fluid/operators/inplace_abn_op.cc      | 10 +--
 paddle/fluid/operators/inplace_abn_op.cu      | 17 ++---
 paddle/phi/api/yaml/legacy_backward.yaml      | 72 +++++++++----------
 paddle/phi/api/yaml/legacy_ops.yaml           | 12 ++--
 paddle/phi/api/yaml/sparse_backward.yaml      |  8 +--
 paddle/phi/api/yaml/sparse_ops.yaml           |  4 +-
 paddle/phi/infermeta/binary.cc                |  6 --
 paddle/phi/infermeta/binary.h                 |  6 --
 paddle/phi/infermeta/multiary.cc              | 18 +++--
 paddle/phi/infermeta/multiary.h               | 11 ++-
 paddle/phi/kernels/batch_norm_grad_kernel.h   |  3 -
 paddle/phi/kernels/batch_norm_kernel.cc       | 11 ++-
 paddle/phi/kernels/batch_norm_kernel.h        | 11 ++-
 paddle/phi/kernels/conv_grad_kernel.h         | 41 +++++------
 paddle/phi/kernels/conv_kernel.h              |  7 --
 .../phi/kernels/cpu/batch_norm_grad_kernel.cc |  4 --
 paddle/phi/kernels/cpu/batch_norm_kernel.cc   |  7 +-
 paddle/phi/kernels/cpu/conv_grad_kernel.cc    | 45 +++++-------
 paddle/phi/kernels/cpu/conv_kernel.cc         |  7 --
 .../phi/kernels/gpu/batch_norm_grad_kernel.cu |  4 --
 paddle/phi/kernels/gpu/batch_norm_kernel.cu   |  7 +-
 paddle/phi/kernels/gpu/conv_grad_kernel.cu    |  3 -
 paddle/phi/kernels/gpu/conv_kernel.cu         |  3 -
 .../kernels/gpu/depthwise_conv_grad_kernel.cu | 11 +--
 .../phi/kernels/gpu/depthwise_conv_kernel.cu  | 12 ++--
 .../gpu/sync_batch_norm_grad_kernel.cu        |  1 -
 .../phi/kernels/gpu/sync_batch_norm_kernel.cu |  7 +-
 paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 30 +++-----
 paddle/phi/kernels/gpudnn/conv_kernel.cu      |  7 --
 .../kernels/sparse/batch_norm_grad_kernel.cc  |  2 -
 .../kernels/sparse/batch_norm_grad_kernel.h   |  1 -
 .../phi/kernels/sparse/batch_norm_kernel.cc   | 14 ++--
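Note: the batch_norm rows in the stat above correspond to a signature reorder that the yaml hunks below spell out: the running statistics (mean, variance) move ahead of the learnable scale/bias, is_test is hoisted to the front of the attributes, and the fuse_with_relu flag is dropped from these APIs. A rough sketch of the generated-op call-site change (argument order taken from the legacy_ops.yaml diff; the user-facing layer API should be unaffected since only internal plumbing is reordered):

    import paddle

    # old generated signature (before this patch):
    #   batch_norm(x, scale, bias, mean, variance, momentum, epsilon,
    #              data_layout, is_test, use_global_stats,
    #              trainable_statistics, fuse_with_relu)
    # new generated signature (after this patch):
    #   batch_norm(x, mean, variance, scale, bias, is_test, momentum,
    #              epsilon, data_layout, use_global_stats,
    #              trainable_statistics)
    x = paddle.rand([2, 8, 4, 4])
    bn = paddle.nn.BatchNorm2D(8)
    y = bn(x)  # behaves the same before and after the reorder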
paddle/phi/kernels/sparse/batch_norm_kernel.h | 1 - .../sparse/gpu/sync_batch_norm_grad_kernel.cu | 2 - .../sparse/gpu/sync_batch_norm_kernel.cu | 14 ++-- .../sparse/sync_batch_norm_grad_kernel.h | 1 - .../kernels/sparse/sync_batch_norm_kernel.h | 7 +- .../phi/kernels/sync_batch_norm_grad_kernel.h | 1 - paddle/phi/kernels/sync_batch_norm_kernel.h | 7 +- .../phi/kernels/xpu/batch_norm_grad_kernel.cc | 1 - paddle/phi/kernels/xpu/batch_norm_kernel.cc | 9 ++- paddle/phi/kernels/xpu/conv_grad_kernel.cc | 4 -- paddle/phi/kernels/xpu/conv_kernel.cc | 4 -- paddle/phi/ops/compat/batch_norm_sig.cc | 17 ++--- paddle/phi/ops/compat/conv3d_sig.cc | 31 ++++---- paddle/phi/ops/compat/depthwise_conv2d_sig.cc | 23 ++---- paddle/phi/ops/compat/sync_batch_norm_sig.cc | 12 ++-- python/paddle/fluid/dygraph/nn.py | 7 +- python/paddle/nn/functional/conv.py | 7 -- python/paddle/nn/functional/norm.py | 7 +- python/paddle/nn/layer/norm.py | 7 +- python/paddle/sparse/nn/layer/norm.py | 14 ++-- 52 files changed, 217 insertions(+), 361 deletions(-) diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 110a0bf105ea2..f87d7effcae45 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -242,7 +242,6 @@ class InplaceABNKernel : public framework::OpKernel { auto is_test = ctx.Attr("is_test"); auto use_global_stats = ctx.Attr("use_global_stats"); auto trainable_statistics = ctx.Attr("trainable_statistics"); - auto fuse_with_relu = ctx.Attr("fuse_with_relu"); auto* mean_out = ctx.Output("MeanOut"); auto* variance_out = ctx.Output("VarianceOut"); @@ -255,17 +254,16 @@ class InplaceABNKernel : public framework::OpKernel { static_cast::TYPE&>(dev_ctx), *x, - *scale, - *bias, *mean, *variance, + *scale, + *bias, + is_test, momentum, epsilon, data_layout, - is_test, use_global_stats, trainable_statistics, - fuse_with_relu, y, mean_out, variance_out, @@ -315,7 +313,6 @@ class InplaceABNGradKernel : public framework::OpKernel { auto is_test = ctx.Attr("is_test"); auto use_global_stats = ctx.Attr("use_global_stats"); auto trainable_statistics = ctx.Attr("trainable_statistics"); - auto fuse_with_relu = ctx.Attr("fuse_with_relu"); auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); @@ -361,7 +358,6 @@ class InplaceABNGradKernel : public framework::OpKernel { is_test, use_global_stats, trainable_statistics, - fuse_with_relu, true, d_x, scale_grad, diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index 0ee6d686a7539..e1131822f289e 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -48,7 +48,6 @@ class InplaceABNKernel : public framework::OpKernel { auto is_test = ctx.Attr("is_test"); auto use_global_stats = ctx.Attr("use_global_stats"); auto trainable_statistics = ctx.Attr("trainable_statistics"); - auto fuse_with_relu = ctx.Attr("fuse_with_relu"); auto* mean_out = ctx.Output("MeanOut"); auto* variance_out = ctx.Output("VarianceOut"); @@ -62,17 +61,16 @@ class InplaceABNKernel : public framework::OpKernel { static_cast::TYPE&>(dev_ctx), *x, - *scale, - *bias, *mean, *variance, + *scale, + *bias, + is_test, momentum, epsilon, data_layout, - is_test, use_global_stats, trainable_statistics, - fuse_with_relu, y, mean_out, variance_out, @@ -85,17 +83,16 @@ class InplaceABNKernel : public framework::OpKernel { static_cast::TYPE&>(dev_ctx), *x, - *scale, - *bias, *mean, *variance, + *scale, + *bias, + is_test, momentum, epsilon, 
data_layout, - is_test, use_global_stats, trainable_statistics, - fuse_with_relu, y, mean_out, variance_out, @@ -146,7 +143,6 @@ class InplaceABNGradKernel : public framework::OpKernel { auto is_test = ctx.Attr("is_test"); auto use_global_stats = ctx.Attr("use_global_stats"); auto trainable_statistics = ctx.Attr("trainable_statistics"); - auto fuse_with_relu = ctx.Attr("fuse_with_relu"); auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); @@ -210,7 +206,6 @@ class InplaceABNGradKernel : public framework::OpKernel { is_test, use_global_stats, trainable_statistics, - fuse_with_relu, true, d_x, scale_grad, diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index d4eade92eeda2..c4b16f73101e1 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -129,8 +129,8 @@ inplace : (out_grad -> x_grad) - backward_op : batch_norm_double_grad - forward : batch_norm_grad (Tensor x, Tensor scale, Tensor bias, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor grad_out, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) - args : (Tensor x, Tensor scale, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor grad_out, Tensor grad_x_grad, Tensor grad_scale_grad, Tensor grad_bias_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + forward : batch_norm_grad (Tensor x, Tensor scale, Tensor bias, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor grad_out, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) + args : (Tensor x, Tensor scale, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor grad_out, Tensor grad_x_grad, Tensor grad_scale_grad, Tensor grad_bias_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics) output : Tensor(x_grad), Tensor(scale_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta @@ -142,8 +142,8 @@ inplace : (grad_out -> grad_out_grad) - backward_op : batch_norm_grad - forward : batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) - args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + forward : batch_norm (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, bool is_test, float momentum, float epsilon, str data_layout, bool use_global_stats, bool trainable_statistics) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, 
Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics) output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) infer_meta : func : GeneralTernaryGradInferMeta @@ -345,9 +345,21 @@ use_gpudnn : true backward : conv2d_transpose_double_grad +- backward_op : conv3d_double_grad + forward : conv3d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format) + output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param: [input, filter, grad_out] + kernel : + func : conv3d_double_grad + use_gpudnn : true + optional : grad_input_grad, grad_filter_grad + - backward_op : conv3d_grad - forward : conv3d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) - args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + forward : conv3d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(input_grad), Tensor(filter_grad) infer_meta : func : GeneralBinaryGradInferMeta @@ -355,19 +367,7 @@ kernel : func : conv3d_grad use_gpudnn : true - backward : conv3d_grad_grad - -- backward_op : conv3d_grad_grad - forward : conv3d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) - args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) - output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) - infer_meta : - func : GeneralTernaryGradInferMeta - param: [input, filter, grad_out] - kernel : - func : conv3d_grad_grad - use_gpudnn : true - optional : grad_input_grad, grad_filter_grad + backward : conv3d_double_grad - backward_op : conv3d_transpose_grad forward : conv3d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) @@ -427,29 +427,29 @@ data_type : x optional : mask +- backward_op : depthwise_conv2d_double_grad + forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_gpudnn) -> 
Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format) + output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param: [input, filter, grad_out] + kernel : + func : depthwise_conv2d_double_grad + optional : grad_input_grad, grad_filter_grad + - backward_op : depthwise_conv2d_grad - forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(out) - args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) + forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_gpudnn) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_gpudnn) output : Tensor(input_grad), Tensor(filter_grad) infer_meta : func : GeneralBinaryGradInferMeta param : [input, filter] kernel : func : depthwise_conv2d_grad - param : [input, filter, out_grad, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, fuse_relu] + param : [input, filter, out_grad, strides, paddings, padding_algorithm, groups, dilations, data_format] use_gpudnn : use_gpudnn - backward : depthwise_conv2d_grad_grad - -- backward_op : depthwise_conv2d_grad_grad - forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(grad_input), Tensor(grad_filter) - args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) - output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) - infer_meta : - func : GeneralTernaryGradInferMeta - param: [input, filter, grad_out] - kernel : - func : depthwise_conv2d_grad_grad - optional : grad_input_grad, grad_filter_grad + backward : depthwise_conv2d_double_grad - backward_op : depthwise_conv2d_transpose_grad forward : depthwise_conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) @@ -2091,8 +2091,8 @@ inplace : (out_grad -> x_grad) - backward_op : sync_batch_norm_grad - forward : sync_batch_norm_ (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), 
Tensor(saved_variance), Tensor(reserve_space) - args : (Tensor x, Tensor scale, Tensor bias, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + forward : sync_batch_norm_ (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, bool is_test, float momentum, float epsilon, str data_layout, bool use_global_stats, bool trainable_statistics) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + args : (Tensor x, Tensor scale, Tensor bias, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics) output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) infer_meta : func : GeneralTernaryGradInferMeta diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 3c3e050b38e7b..0f118add06abb 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -254,7 +254,7 @@ inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) - op : batch_norm - args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + args : (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, bool is_test, float momentum, float epsilon, str data_layout, bool use_global_stats, bool trainable_statistics) output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) infer_meta: func : BatchNormInferMeta @@ -464,7 +464,7 @@ backward : conv2d_transpose_grad - op : conv3d - args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor infer_meta : func : Conv3DInferMeta @@ -551,14 +551,14 @@ backward : deformable_conv_grad - op : depthwise_conv2d - args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) + args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_gpudnn) output : Tensor(out) infer_meta : func : DepthwiseConvInferMeta - param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search] + param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format] kernel : func : depthwise_conv2d - param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, fuse_relu] + param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format] use_gpudnn : use_gpudnn backward : 
depthwise_conv2d_grad @@ -2373,7 +2373,7 @@ backward : swish_grad - op : sync_batch_norm_ - args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + args : (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, bool is_test, float momentum, float epsilon, str data_layout, bool use_global_stats, bool trainable_statistics) output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) infer_meta : func : BatchNormInferMeta diff --git a/paddle/phi/api/yaml/sparse_backward.yaml b/paddle/phi/api/yaml/sparse_backward.yaml index 72c4cc61eea45..b0a4a97ca5f63 100644 --- a/paddle/phi/api/yaml/sparse_backward.yaml +++ b/paddle/phi/api/yaml/sparse_backward.yaml @@ -101,8 +101,8 @@ atanh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} - backward_op : batch_norm_grad - forward : batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) - args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + forward : batch_norm (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, bool is_test, float momentum, float epsilon, str data_layout, bool use_global_stats, bool trainable_statistics) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics) output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) infer_meta : func : GeneralTernaryGradInferMeta @@ -368,8 +368,8 @@ subtract_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} - backward_op : sync_batch_norm_grad - forward : sync_batch_norm_(Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) - args : (Tensor x, Tensor scale, Tensor bias, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + forward : sync_batch_norm_(Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, bool is_test, float momentum, float epsilon, str data_layout, bool use_global_stats, bool trainable_statistics) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + args : (Tensor x, Tensor scale, Tensor bias, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor 
out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics) output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) infer_meta : func : GeneralTernaryGradInferMeta diff --git a/paddle/phi/api/yaml/sparse_ops.yaml b/paddle/phi/api/yaml/sparse_ops.yaml index 12965ce9fbe2c..6e6fd07a240fb 100644 --- a/paddle/phi/api/yaml/sparse_ops.yaml +++ b/paddle/phi/api/yaml/sparse_ops.yaml @@ -88,7 +88,7 @@ backward : atanh_grad - op : batch_norm - args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + args : (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, bool is_test, float momentum, float epsilon, str data_layout, bool use_global_stats, bool trainable_statistics) output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) infer_meta : func : BatchNormInferMeta @@ -324,7 +324,7 @@ backward : subtract_grad - op : sync_batch_norm_ - args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + args : (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, bool is_test, float momentum, float epsilon, str data_layout, bool use_global_stats, bool trainable_statistics) output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) infer_meta : func : BatchNormInferMeta diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 48e72f503bf3c..578bfc37cdf68 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -564,9 +564,6 @@ void Conv3DInferMeta(const MetaTensor& input, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, MetaTensor* out, MetaConfig config) { ConvInferMeta(input, @@ -927,9 +924,6 @@ void DepthwiseConvInferMeta(const MetaTensor& input, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, MetaTensor* out, MetaConfig config) { ConvInferMeta(input, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 30e22cb3f56a6..2d3bbf516f236 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -95,9 +95,6 @@ void Conv3DInferMeta(const MetaTensor& input, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, MetaTensor* out, MetaConfig config = MetaConfig()); @@ -151,9 +148,6 @@ void DepthwiseConvInferMeta(const MetaTensor& input, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, MetaTensor* out, MetaConfig config = MetaConfig()); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 52050f160e24e..c90c3a54de06f 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -534,17 +534,16 @@ void AverageAccumulatesInferMeta(const MetaTensor& param, } void BatchNormInferMeta(const MetaTensor& x, - const MetaTensor& scale, - const MetaTensor& bias, const 
MetaTensor& mean, const MetaTensor& variance, + const MetaTensor& scale, + const MetaTensor& bias, + bool is_test, float momentum, float epsilon, const std::string& data_layout_str, - bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, MetaTensor* y, MetaTensor* mean_out, MetaTensor* variance_out, @@ -646,10 +645,10 @@ void BatchNormInferMeta(const MetaTensor& x, } void BatchNormInferInferMeta(const MetaTensor& x, - const MetaTensor& scale, - const MetaTensor& bias, const MetaTensor& mean, const MetaTensor& variance, + const MetaTensor& scale, + const MetaTensor& bias, float momentum, float epsilon, const std::string& data_layout, @@ -658,17 +657,16 @@ void BatchNormInferInferMeta(const MetaTensor& x, MetaTensor* variance_out, MetaConfig config) { BatchNormInferMeta(x, - scale, - bias, mean, variance, + scale, + bias, + /*is_test=*/true, momentum, epsilon, data_layout, - /*is_test=*/true, /*use_global_stats=*/false, /*trainable_statistics=*/false, - /*fuse_with_relu=*/false, y, mean_out, variance_out, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 79926e06b2b2e..a37925202926a 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -158,17 +158,16 @@ void AverageAccumulatesInferMeta(const MetaTensor& param, MetaTensor* out_num_updates); void BatchNormInferMeta(const MetaTensor& x, - const MetaTensor& scale, - const MetaTensor& bias, const MetaTensor& mean, const MetaTensor& variance, + const MetaTensor& scale, + const MetaTensor& bias, + bool is_test, float momentum, float epsilon, const std::string& data_layout, - bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, MetaTensor* y, MetaTensor* mean_out, MetaTensor* variance_out, @@ -178,10 +177,10 @@ void BatchNormInferMeta(const MetaTensor& x, MetaConfig config = MetaConfig()); void BatchNormInferInferMeta(const MetaTensor& x, - const MetaTensor& scale, - const MetaTensor& bias, const MetaTensor& mean, const MetaTensor& variance, + const MetaTensor& scale, + const MetaTensor& bias, float momentum, float epsilon, const std::string& data_layout, diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h index afbb0c78ca981..24e23e8d69074 100644 --- a/paddle/phi/kernels/batch_norm_grad_kernel.h +++ b/paddle/phi/kernels/batch_norm_grad_kernel.h @@ -37,7 +37,6 @@ void BatchNormGradRawKernel(const Context& dev_ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, bool is_inplace, DenseTensor* x_grad, DenseTensor* scale_grad, @@ -60,7 +59,6 @@ void BatchNormGradKernel(const Context& dev_ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad); @@ -83,7 +81,6 @@ void BatchNormDoubleGradKernel(const Context& dev_ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor* x_grad, DenseTensor* scale_grad, DenseTensor* y_grad_grad); diff --git a/paddle/phi/kernels/batch_norm_kernel.cc b/paddle/phi/kernels/batch_norm_kernel.cc index 623b4c1cc745b..eddd65184fe93 100644 --- a/paddle/phi/kernels/batch_norm_kernel.cc +++ b/paddle/phi/kernels/batch_norm_kernel.cc @@ -22,10 +22,10 @@ namespace phi { template void BatchNormInferKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& scale, - const DenseTensor& bias, const DenseTensor& mean, const DenseTensor& variance, + const 
DenseTensor& scale, + const DenseTensor& bias, float momentum, float epsilon, const std::string& data_layout, @@ -39,17 +39,16 @@ void BatchNormInferKernel(const Context& dev_ctx, auto saved_variance = phi::EmptyLike(dev_ctx, *variance_out); BatchNormKernel(dev_ctx, x, - scale, - bias, mean, variance, + scale, + bias, + /*is_test=*/true, momentum, epsilon, data_layout, - /*is_test=*/true, /*use_global_stats=*/false, /*trainable_statistics=*/false, - /*fuse_with_relu=*/false, y, mean_out, variance_out, diff --git a/paddle/phi/kernels/batch_norm_kernel.h b/paddle/phi/kernels/batch_norm_kernel.h index be589e43647c1..219b52894f988 100644 --- a/paddle/phi/kernels/batch_norm_kernel.h +++ b/paddle/phi/kernels/batch_norm_kernel.h @@ -23,17 +23,16 @@ namespace phi { template void BatchNormKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& scale, - const DenseTensor& bias, const DenseTensor& mean, const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, + bool is_test, float momentum, float epsilon, const std::string& data_layout, - bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor* y, DenseTensor* mean_out, DenseTensor* variance_out, @@ -44,10 +43,10 @@ void BatchNormKernel(const Context& dev_ctx, template void BatchNormInferKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& scale, - const DenseTensor& bias, const DenseTensor& mean, const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, float momentum, float epsilon, const std::string& data_layout, diff --git a/paddle/phi/kernels/conv_grad_kernel.h b/paddle/phi/kernels/conv_grad_kernel.h index 4164db5f8a01b..8eb67862f80c2 100644 --- a/paddle/phi/kernels/conv_grad_kernel.h +++ b/paddle/phi/kernels/conv_grad_kernel.h @@ -43,9 +43,6 @@ void Conv3DGradKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* input_grad, DenseTensor* filter_grad); @@ -60,10 +57,6 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - bool fuse_relu, DenseTensor* input_grad, DenseTensor* filter_grad); @@ -85,23 +78,21 @@ void ConvGradGradKernel(const Context& dev_ctx, DenseTensor* out_grad_grad); template -void Conv3DGradGradKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad); +void Conv3DDoubleGradKernel( + const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad); } // 
namespace phi diff --git a/paddle/phi/kernels/conv_kernel.h b/paddle/phi/kernels/conv_kernel.h index 06faee6d3aa1e..a19ad0c949eaa 100644 --- a/paddle/phi/kernels/conv_kernel.h +++ b/paddle/phi/kernels/conv_kernel.h @@ -40,9 +40,6 @@ void Conv3DKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* out); template @@ -55,10 +52,6 @@ void DepthwiseConvKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - bool fuse_relu, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index cc5541b7e54e9..f2054d4d396c6 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -52,7 +52,6 @@ void BatchNormGradRawKernel(const Context& ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, bool is_inplace, DenseTensor* x_grad, DenseTensor* scale_grad, @@ -310,7 +309,6 @@ void BatchNormGradKernel(const Context& dev_ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad) { @@ -330,7 +328,6 @@ void BatchNormGradKernel(const Context& dev_ctx, is_test, use_global_stats, trainable_statistics, - fuse_with_relu, false, x_grad, scale_grad, @@ -355,7 +352,6 @@ void BatchNormDoubleGradKernel(const Context& ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor* x_grad, DenseTensor* scale_grad, DenseTensor* y_grad_grad) { diff --git a/paddle/phi/kernels/cpu/batch_norm_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_kernel.cc index e3a307c4b379b..332df1d9f137e 100644 --- a/paddle/phi/kernels/cpu/batch_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_kernel.cc @@ -36,17 +36,16 @@ using ConstEigenVectorArrayMap = template void BatchNormKernel(const Context& ctx, const DenseTensor& x, - const DenseTensor& scale, - const DenseTensor& bias, const DenseTensor& mean, const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, + bool is_test, float momentum, float epsilon, const std::string& data_layout_str, - bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor* y, DenseTensor* mean_out, DenseTensor* variance_out, diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc index 06a63267c5c96..273b04df887fd 100644 --- a/paddle/phi/kernels/cpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc @@ -31,10 +31,6 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - bool fuse_relu, DenseTensor* input_grad, DenseTensor* filter_grad) { ConvGradKernel(dev_ctx, @@ -62,9 +58,6 @@ void Conv3DGradKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* input_grad, DenseTensor* filter_grad) { ConvGradKernel(dev_ctx, @@ -82,24 +75,22 @@ void Conv3DGradKernel(const Context& dev_ctx, } template -void Conv3DGradGradKernel(const Context& ctx, - 
const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { +void Conv3DDoubleGradKernel( + const Context& ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { ConvGradGradKernel(ctx, input, filter, @@ -136,9 +127,9 @@ PD_REGISTER_KERNEL( conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { } -PD_REGISTER_KERNEL(conv3d_grad_grad, +PD_REGISTER_KERNEL(conv3d_double_grad, CPU, ALL_LAYOUT, - phi::Conv3DGradGradKernel, + phi::Conv3DDoubleGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/conv_kernel.cc b/paddle/phi/kernels/cpu/conv_kernel.cc index 12bfa852d96c9..dffa4639f0118 100644 --- a/paddle/phi/kernels/cpu/conv_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_kernel.cc @@ -53,10 +53,6 @@ void DepthwiseConvKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - bool fuse_relu, DenseTensor* out) { ConvKernelImpl(dev_ctx, input, @@ -80,9 +76,6 @@ void Conv3DKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* out) { ConvKernelImpl(dev_ctx, input, diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index 5a3c04c418e0b..be78b741c7269 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -578,7 +578,6 @@ void BatchNormGradRawKernel(const Context &ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, bool is_inplace, DenseTensor *x_grad, DenseTensor *scale_grad, @@ -1262,7 +1261,6 @@ void BatchNormGradKernel(const Context &dev_ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor *x_grad, DenseTensor *scale_grad, DenseTensor *bias_grad) { @@ -1282,7 +1280,6 @@ void BatchNormGradKernel(const Context &dev_ctx, is_test, use_global_stats, trainable_statistics, - fuse_with_relu, false, x_grad, scale_grad, @@ -1307,7 +1304,6 @@ void BatchNormDoubleGradKernel(const Context &ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor *x_grad, DenseTensor *scale_grad, DenseTensor *y_grad_grad) { diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 01a58e101f60e..a90d85dc2886f 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -533,17 +533,16 @@ static __global__ void BNForwardTraining2DWriteRes( template void 
BatchNormKernel(const Context &ctx, const DenseTensor &x, - const DenseTensor &scale, - const DenseTensor &bias, const DenseTensor &mean, const DenseTensor &variance, + const DenseTensor &scale, + const DenseTensor &bias, + bool is_test, float momentum, float epsilon_f, const std::string &data_layout_str, - bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor *y, DenseTensor *mean_out, DenseTensor *variance_out, diff --git a/paddle/phi/kernels/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_kernel.cu index 4f15030365a6c..037a619b7a29b 100644 --- a/paddle/phi/kernels/gpu/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_grad_kernel.cu @@ -31,9 +31,6 @@ void Conv3DGradKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* input_grad, DenseTensor* filter_grad) { ConvGradKernel(dev_ctx, diff --git a/paddle/phi/kernels/gpu/conv_kernel.cu b/paddle/phi/kernels/gpu/conv_kernel.cu index a089175c96fb6..d84c7f7be45b9 100644 --- a/paddle/phi/kernels/gpu/conv_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_kernel.cu @@ -53,9 +53,6 @@ void Conv3DKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* out) { ConvKernelImpl(dev_ctx, input, diff --git a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu index fadc21dfb34c4..2e815b3e455d5 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu @@ -33,16 +33,19 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, int groups, const std::vector& dilations_t, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - bool fuse_relu, DenseTensor* input_grad, DenseTensor* filter_grad) { const DenseTensor* output_grad = &out_grad; if (!input_grad && !filter_grad) return; + bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); + bool fuse_relu = + has_fuse_relu + ? PADDLE_GET_CONST( + bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) + : false; + std::vector strides = strides_t; std::vector paddings = paddings_t; std::vector dilations = dilations_t; diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu index a26dfd44691e2..18e6913be1f1e 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu @@ -31,10 +31,6 @@ void DepthwiseConvKernel(const Context& dev_ctx, int groups, const std::vector& dilations_t, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - bool fuse_relu, DenseTensor* out) { DenseTensor* output = out; output->mutable_data(dev_ctx.GetPlace()); @@ -44,6 +40,14 @@ void DepthwiseConvKernel(const Context& dev_ctx, std::vector paddings = paddings_t; const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); + bool fuse_relu = + has_fuse_relu + ? 
PADDLE_GET_CONST( + bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) + : false; + if (channel_last) { PADDLE_ENFORCE_EQ( output->dims()[output->dims().size() - 1] % diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu index 5cf7aabda303a..84d3f3c972ad9 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu @@ -34,7 +34,6 @@ void SyncBatchNormGradKernel(const Context& ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad) { diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu index d41f50677fdf5..053571bd9a5aa 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu @@ -22,17 +22,16 @@ namespace phi { template void SyncBatchNormKernel(const Context &ctx, const DenseTensor &x, - const DenseTensor &scale, - const DenseTensor &bias, const DenseTensor &mean, const DenseTensor &variance, + const DenseTensor &scale, + const DenseTensor &bias, + bool is_test, float momentum, float epsilon_f, const std::string &data_layout_str, - bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor *y, DenseTensor *mean_out, DenseTensor *variance_out, diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index dcd1e133c729d..a69345963324d 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -603,9 +603,6 @@ void Conv3DCudnnGradKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* input_grad, DenseTensor* filter_grad) { ConvCudnnGradKernel(dev_ctx, @@ -1295,10 +1292,6 @@ void DepthwiseConvDoubleGradGPUDNNKernel( int groups, const std::vector& dilations_t, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - bool fuse_relu, DenseTensor* input_grad, DenseTensor* filter_grad, DenseTensor* out_grad_grad) { @@ -1320,7 +1313,7 @@ void DepthwiseConvDoubleGradGPUDNNKernel( } template -void Conv3DCudnnGradGradKernel( +void Conv3DCudnnDoubleGradKernel( const Context& ctx, const DenseTensor& input, const DenseTensor& filter, @@ -1333,9 +1326,6 @@ void Conv3DCudnnGradGradKernel( int groups, const std::vector& dilations_t, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, DenseTensor* input_grad, DenseTensor* filter_grad, DenseTensor* out_grad_grad) { @@ -1386,14 +1376,14 @@ PD_REGISTER_KERNEL(conv2d_grad_grad, float, phi::dtype::float16) {} -PD_REGISTER_KERNEL(conv3d_grad_grad, +PD_REGISTER_KERNEL(conv3d_double_grad, GPUDNN, ALL_LAYOUT, - phi::Conv3DCudnnGradGradKernel, + phi::Conv3DCudnnDoubleGradKernel, float, phi::dtype::float16) {} -PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, +PD_REGISTER_KERNEL(depthwise_conv2d_double_grad, GPU, ALL_LAYOUT, phi::DepthwiseConvDoubleGradGPUDNNKernel, @@ -1427,16 +1417,16 @@ PD_REGISTER_KERNEL(conv2d_grad_grad, phi::dtype::float16, phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(conv3d_grad_grad, +PD_REGISTER_KERNEL(conv3d_double_grad, GPUDNN, ALL_LAYOUT, - phi::Conv3DCudnnGradGradKernel, + phi::Conv3DCudnnDoubleGradKernel, 
float, double, phi::dtype::float16, phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, +PD_REGISTER_KERNEL(depthwise_conv2d_double_grad, GPU, ALL_LAYOUT, phi::DepthwiseConvDoubleGradGPUDNNKernel, @@ -1469,15 +1459,15 @@ PD_REGISTER_KERNEL(conv2d_grad_grad, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL(conv3d_grad_grad, +PD_REGISTER_KERNEL(conv3d_double_grad, GPUDNN, ALL_LAYOUT, - phi::Conv3DCudnnGradGradKernel, + phi::Conv3DCudnnDoubleGradKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, +PD_REGISTER_KERNEL(depthwise_conv2d_double_grad, GPU, ALL_LAYOUT, phi::DepthwiseConvDoubleGradGPUDNNKernel, diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index 3d1e7776ba394..a393cc278f2c2 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -397,9 +397,6 @@ void Conv3DCudnnKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* out) { ConvCudnnKernel(dev_ctx, input, @@ -423,10 +420,6 @@ void DepthwiseConvCudnnKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - bool fuse_relu, DenseTensor* out) { ConvCudnnKernel(dev_ctx, input, diff --git a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc index f9a96b15eedfe..ff3173ec0a101 100644 --- a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc @@ -38,7 +38,6 @@ void BatchNormCooGradKernel(const Context& dev_ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, SparseCooTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad) { @@ -61,7 +60,6 @@ void BatchNormCooGradKernel(const Context& dev_ctx, is_test, use_global_stats, trainable_statistics, - fuse_with_relu, x_grad->mutable_values(), scale_grad, bias_grad); diff --git a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.h b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.h index b7051683170e6..90f5e5b00ad3f 100644 --- a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.h +++ b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.h @@ -39,7 +39,6 @@ void BatchNormCooGradKernel(const Context& dev_ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, SparseCooTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad); diff --git a/paddle/phi/kernels/sparse/batch_norm_kernel.cc b/paddle/phi/kernels/sparse/batch_norm_kernel.cc index 4f925e83a9b69..04ab36892513c 100644 --- a/paddle/phi/kernels/sparse/batch_norm_kernel.cc +++ b/paddle/phi/kernels/sparse/batch_norm_kernel.cc @@ -23,17 +23,16 @@ namespace sparse { template void BatchNormCooKernel(const Context& dev_ctx, const SparseCooTensor& x, - const DenseTensor& scale, - const DenseTensor& bias, const DenseTensor& mean, const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, + bool is_test, float momentum, float epsilon, const std::string& data_layout, - bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, SparseCooTensor* y, DenseTensor* mean_out, DenseTensor* variance_out, @@ -43,17 +42,16 @@ void BatchNormCooKernel(const Context& dev_ctx, EmptyLikeCooKernel(dev_ctx, x, y); 
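  // The hunk below applies the series' signature reordering to this call.
  // Consolidated, the post-patch call reads as follows (a sketch assembled
  // from the +/- lines that follow, not itself a hunk of this patch): the
  // whole batch_norm family converges on one canonical argument order,
  // (x, mean, variance, scale, bias, is_test, momentum, epsilon, data_layout,
  // use_global_stats, trainable_statistics), and the unused fuse_with_relu
  // flag is dropped.
  //
  //   phi::BatchNormKernel<T, Context>(dev_ctx,
  //                                    x.values(),
  //                                    mean,
  //                                    variance,
  //                                    scale,
  //                                    bias,
  //                                    is_test,
  //                                    momentum,
  //                                    epsilon,
  //                                    data_layout,
  //                                    use_global_stats,
  //                                    trainable_statistics,
  //                                    y->mutable_values(),
  //                                    mean_out,
  //                                    variance_out,
  //                                    saved_mean,
  //                                    saved_variance,
  //                                    reserve_space);
  //
  // The same order is mirrored across the dense CPU/GPU/XPU, sparse, and
  // sync_batch_norm variants touched elsewhere in this series.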
phi::BatchNormKernel(dev_ctx, x.values(), - scale, - bias, mean, variance, + scale, + bias, + is_test, momentum, epsilon, data_layout, - is_test, use_global_stats, trainable_statistics, - fuse_with_relu, y->mutable_values(), mean_out, variance_out, diff --git a/paddle/phi/kernels/sparse/batch_norm_kernel.h b/paddle/phi/kernels/sparse/batch_norm_kernel.h index 282a8de7b39d4..03e621cc65325 100644 --- a/paddle/phi/kernels/sparse/batch_norm_kernel.h +++ b/paddle/phi/kernels/sparse/batch_norm_kernel.h @@ -35,7 +35,6 @@ void BatchNormKernel(const Context& dev_ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, SparseCooTensor* y, DenseTensor* mean_out, DenseTensor* variance_out, diff --git a/paddle/phi/kernels/sparse/gpu/sync_batch_norm_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sync_batch_norm_grad_kernel.cu index e0805578a0f86..664b3a1ee2699 100644 --- a/paddle/phi/kernels/sparse/gpu/sync_batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sync_batch_norm_grad_kernel.cu @@ -37,7 +37,6 @@ void SyncBatchNormCooGradKernel( bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, SparseCooTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad) { @@ -58,7 +57,6 @@ void SyncBatchNormCooGradKernel( is_test, use_global_stats, trainable_statistics, - fuse_with_relu, x_grad->mutable_values(), scale_grad, bias_grad); diff --git a/paddle/phi/kernels/sparse/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/sparse/gpu/sync_batch_norm_kernel.cu index a518148f2c95b..162f1f4b93765 100644 --- a/paddle/phi/kernels/sparse/gpu/sync_batch_norm_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sync_batch_norm_kernel.cu @@ -23,17 +23,16 @@ namespace sparse { template void SyncBatchNormCooKernel(const Context& dev_ctx, const SparseCooTensor& x, - const DenseTensor& scale, - const DenseTensor& bias, const DenseTensor& mean, const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, + bool is_test, float momentum, float epsilon, const std::string& data_layout, - bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, SparseCooTensor* y, DenseTensor* mean_out, DenseTensor* variance_out, @@ -43,17 +42,16 @@ void SyncBatchNormCooKernel(const Context& dev_ctx, EmptyLikeCooKernel(dev_ctx, x, y); phi::SyncBatchNormKernel(dev_ctx, x.values(), - scale, - bias, mean, variance, + scale, + bias, + is_test, momentum, epsilon, data_layout, - is_test, use_global_stats, trainable_statistics, - fuse_with_relu, y->mutable_values(), mean_out, variance_out, diff --git a/paddle/phi/kernels/sparse/sync_batch_norm_grad_kernel.h b/paddle/phi/kernels/sparse/sync_batch_norm_grad_kernel.h index 9591e6f035ca7..533ad99a7f088 100644 --- a/paddle/phi/kernels/sparse/sync_batch_norm_grad_kernel.h +++ b/paddle/phi/kernels/sparse/sync_batch_norm_grad_kernel.h @@ -38,7 +38,6 @@ void SyncBatchNormCooGradKernel( bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, SparseCooTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad); diff --git a/paddle/phi/kernels/sparse/sync_batch_norm_kernel.h b/paddle/phi/kernels/sparse/sync_batch_norm_kernel.h index 7ee4baa107971..09b5a8445dba3 100644 --- a/paddle/phi/kernels/sparse/sync_batch_norm_kernel.h +++ b/paddle/phi/kernels/sparse/sync_batch_norm_kernel.h @@ -25,17 +25,16 @@ namespace sparse { template void SyncBatchNormCooKernel(const Context& dev_ctx, const SparseCooTensor& x, - const DenseTensor& scale, - const DenseTensor& 
bias, const DenseTensor& mean, const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, + bool is_test, float momentum, float epsilon, const std::string& data_layout, - bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, SparseCooTensor* y, DenseTensor* mean_out, DenseTensor* variance_out, diff --git a/paddle/phi/kernels/sync_batch_norm_grad_kernel.h b/paddle/phi/kernels/sync_batch_norm_grad_kernel.h index a38f42c29f62d..b070a1e409ae1 100644 --- a/paddle/phi/kernels/sync_batch_norm_grad_kernel.h +++ b/paddle/phi/kernels/sync_batch_norm_grad_kernel.h @@ -35,7 +35,6 @@ void SyncBatchNormGradKernel(const Context& dev_ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad); diff --git a/paddle/phi/kernels/sync_batch_norm_kernel.h b/paddle/phi/kernels/sync_batch_norm_kernel.h index a4909deb648cf..761ce1503a22a 100644 --- a/paddle/phi/kernels/sync_batch_norm_kernel.h +++ b/paddle/phi/kernels/sync_batch_norm_kernel.h @@ -37,17 +37,16 @@ ccl::CCLComm GetCCLComm(const Place& place, int global_gid = 0); template void SyncBatchNormKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& scale, - const DenseTensor& bias, const DenseTensor& mean, const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, + bool is_test, float momentum, float epsilon, const std::string& data_layout, - bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor* y, DenseTensor* mean_out, DenseTensor* variance_out, diff --git a/paddle/phi/kernels/xpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/xpu/batch_norm_grad_kernel.cc index 6760eadf019e4..743fb9bc3fffc 100644 --- a/paddle/phi/kernels/xpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/batch_norm_grad_kernel.cc @@ -86,7 +86,6 @@ void BatchNormGradKernel(const Context &dev_ctx, bool is_test, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor *x_grad, DenseTensor *scale_grad, DenseTensor *bias_grad) { diff --git a/paddle/phi/kernels/xpu/batch_norm_kernel.cc b/paddle/phi/kernels/xpu/batch_norm_kernel.cc index b73879f709f01..2e4ec7103a35d 100644 --- a/paddle/phi/kernels/xpu/batch_norm_kernel.cc +++ b/paddle/phi/kernels/xpu/batch_norm_kernel.cc @@ -23,17 +23,16 @@ namespace phi { template void BatchNormKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& scale, - const DenseTensor& bias, const DenseTensor& mean, const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, + bool is_test, float momentum, float epsilon, - const std::string& data_layout_str, - bool is_test, + const std::string& data_layout, bool use_global_stats, bool trainable_statistics, - bool fuse_with_relu, DenseTensor* y, DenseTensor* mean_out, DenseTensor* variance_out, diff --git a/paddle/phi/kernels/xpu/conv_grad_kernel.cc b/paddle/phi/kernels/xpu/conv_grad_kernel.cc index ad97d86e916fa..de4c573b375f6 100644 --- a/paddle/phi/kernels/xpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_grad_kernel.cc @@ -152,10 +152,6 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - bool fuse_relu, DenseTensor* input_grad, DenseTensor* filter_grad) { ConvGradKernel(dev_ctx, diff --git a/paddle/phi/kernels/xpu/conv_kernel.cc 
b/paddle/phi/kernels/xpu/conv_kernel.cc index 05f5f939187c4..8bbbdc2c16d8b 100644 --- a/paddle/phi/kernels/xpu/conv_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_kernel.cc @@ -118,10 +118,6 @@ void DepthwiseConvKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - bool fuse_relu, DenseTensor* out) { ConvKernel(dev_ctx, input, diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc index 1c6b63d70c705..ff7a582142513 100644 --- a/paddle/phi/ops/compat/batch_norm_sig.cc +++ b/paddle/phi/ops/compat/batch_norm_sig.cc @@ -33,19 +33,18 @@ KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) { if (is_test && !use_global_stats && !trainable_statistics && !fuse_with_relu) { return KernelSignature("batch_norm_infer", - {"X", "Scale", "Bias", "Mean", "Variance"}, + {"X", "Mean", "Variance", "Scale", "Bias"}, {"momentum", "epsilon", "data_layout"}, {"Y", "MeanOut", "VarianceOut"}); } else { return KernelSignature("batch_norm", - {"X", "Scale", "Bias", "Mean", "Variance"}, - {"momentum", + {"X", "Mean", "Variance", "Scale", "Bias"}, + {"is_test", + "momentum", "epsilon", "data_layout", - "is_test", "use_global_stats", - "trainable_statistics", - "fuse_with_relu"}, + "trainable_statistics"}, {"Y", "MeanOut", "VarianceOut", @@ -74,8 +73,7 @@ KernelSignature BatchNormGradOpArgumentMapping( "data_layout", "is_test", "use_global_stats", - "trainable_statistics", - "fuse_with_relu"}, + "trainable_statistics"}, {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); } @@ -97,8 +95,7 @@ KernelSignature BatchNormGradGradOpArgumentMapping( "data_layout", "is_test", "use_global_stats", - "trainable_statistics", - "fuse_with_relu"}, + "trainable_statistics"}, {"DX", "DScale", "DDY"}); } diff --git a/paddle/phi/ops/compat/conv3d_sig.cc b/paddle/phi/ops/compat/conv3d_sig.cc index 68bd54609cb03..f08c2eb3ec228 100644 --- a/paddle/phi/ops/compat/conv3d_sig.cc +++ b/paddle/phi/ops/compat/conv3d_sig.cc @@ -19,15 +19,14 @@ namespace phi { KernelSignature Conv3dOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("conv3d", {"Input", "Filter"}, - {"strides", - "paddings", - "padding_algorithm", - "groups", - "dilations", - "data_format", - "use_addto", - "workspace_size_MB", - "exhaustive_search"}, + { + "strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + }, {"Output"}); } @@ -39,31 +38,27 @@ KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { "padding_algorithm", "groups", "dilations", - "data_format", - "use_addto", - "workspace_size_MB", - "exhaustive_search"}, + "data_format"}, {"Input@GRAD", "Filter@GRAD"}); } KernelSignature Conv3dDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("conv3d_grad_grad", + return KernelSignature("conv3d_double_grad", {"Input", "Filter", "DOutput", "DDInput", "DDFilter"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", - "data_format", - "use_addto", - "workspace_size_MB", - "exhaustive_search"}, + "data_format"}, {"DInput", "DFilter", "DDOutput"}); } } // namespace phi +PD_REGISTER_BASE_KERNEL_NAME(conv3d_grad_grad, conv3d_double_grad); + PD_REGISTER_ARG_MAPPING_FN(conv3d, phi::Conv3dOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(conv3d_grad, phi::Conv3dGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(conv3d_grad_grad, diff --git 
a/paddle/phi/ops/compat/depthwise_conv2d_sig.cc b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc index f7558499fd8fc..08ff91c2cae5f 100644 --- a/paddle/phi/ops/compat/depthwise_conv2d_sig.cc +++ b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc @@ -25,11 +25,7 @@ KernelSignature DepthwiseConv2dOpArgumentMapping( "padding_algorithm", "groups", "dilations", - "data_format", - "use_addto", - "workspace_size_MB", - "exhaustive_search", - "fuse_relu_before_depthwise_conv"}, + "data_format"}, {"Output"}); } @@ -42,33 +38,28 @@ KernelSignature DepthwiseConv2dGradOpArgumentMapping( "padding_algorithm", "groups", "dilations", - "data_format", - "use_addto", - "workspace_size_MB", - "exhaustive_search", - "fuse_relu_before_depthwise_conv"}, + "data_format"}, {"Input@GRAD", "Filter@GRAD"}); } KernelSignature DepthwiseConv2dDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("depthwise_conv2d_grad_grad", + return KernelSignature("depthwise_conv2d_double_grad", {"Input", "Filter", "DOutput", "DDInput", "DDFilter"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", - "data_format", - "use_addto", - "workspace_size_MB", - "exhaustive_search", - "fuse_relu_before_depthwise_conv"}, + "data_format"}, {"DInput", "DFilter", "DDOutput"}); } } // namespace phi +PD_REGISTER_BASE_KERNEL_NAME(depthwise_conv2d_grad_grad, + depthwise_conv2d_double_grad); + PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d, phi::DepthwiseConv2dOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad, diff --git a/paddle/phi/ops/compat/sync_batch_norm_sig.cc b/paddle/phi/ops/compat/sync_batch_norm_sig.cc index 9380751987ebc..067d1905cf377 100644 --- a/paddle/phi/ops/compat/sync_batch_norm_sig.cc +++ b/paddle/phi/ops/compat/sync_batch_norm_sig.cc @@ -19,14 +19,13 @@ namespace phi { KernelSignature SyncBatchNormOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("sync_batch_norm", - {"X", "Scale", "Bias", "Mean", "Variance"}, - {"momentum", + {"X", "Mean", "Variance", "Scale", "Bias"}, + {"is_test", + "momentum", "epsilon", "data_layout", - "is_test", "use_global_stats", - "trainable_statistics", - "fuse_with_relu"}, + "trainable_statistics"}, {"Y", "MeanOut", "VarianceOut", @@ -52,8 +51,7 @@ KernelSignature SyncBatchNormGradOpArgumentMapping( "data_layout", "is_test", "use_global_stats", - "trainable_statistics", - "fuse_with_relu"}, + "trainable_statistics"}, {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); } diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 3231b18c8886e..0029197c6698a 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -1533,17 +1533,16 @@ def forward(self, input): if in_dygraph_mode(): batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm( input, - self.weight, - self.bias, self._mean, self._variance, + self.weight, + self.bias, + not self.training, self._momentum, self._epsilon, self._data_layout, - not self.training, self._use_global_stats, self._trainable_statistics, - False, ) return dygraph_utils._append_activation_in_dygraph( batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index f022f5a8e38a3..d1b2ff284fb8e 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -172,10 +172,6 @@ def _conv_nd( groups, dilation, data_format, - False, - -1, - False, - False, use_cudnn, ) if bias is not None: @@ -202,9 +198,6 @@ def _conv_nd( 
groups, dilation, data_format, - False, - -1, - False, ) if bias is not None: channel_dim = ( diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 25008f7e2dc42..b54b53c34260e 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -202,17 +202,16 @@ def batch_norm( if in_dygraph_mode(): batch_norm_out, _, _, _, _, _ = _C_ops.batch_norm( x, - weight, - bias, running_mean, running_var, + weight, + bias, + not training, momentum, epsilon, data_format, - not training, use_global_stats, trainable_statistics, - False, ) return dygraph_utils._append_activation_in_dygraph( diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 8864237a2820a..e95cff7b16732 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -1180,15 +1180,14 @@ def forward(self, x): if in_dygraph_mode(): sync_batch_norm_out, _, _, _, _, _ = _C_ops.sync_batch_norm_( x, - self.weight, - self.bias, self._mean, self._variance, + self.weight, + self.bias, + not self.training, self._momentum, self._epsilon, self._data_format, - not self.training, - False, False, False, ) diff --git a/python/paddle/sparse/nn/layer/norm.py b/python/paddle/sparse/nn/layer/norm.py index b3993a2947b00..38c90ee0387fd 100644 --- a/python/paddle/sparse/nn/layer/norm.py +++ b/python/paddle/sparse/nn/layer/norm.py @@ -140,17 +140,16 @@ def forward(self, input): if in_dynamic_mode(): batch_norm_out, _, _, _, _, _ = _C_ops.sparse_batch_norm( input, - self.weight, - self.bias, self._mean, self._variance, + self.weight, + self.bias, + not self.training, self._momentum, self._epsilon, data_format, - not self.training, self._use_global_stats, trainable_statistics, - False, ) return batch_norm_out else: @@ -324,15 +323,14 @@ def forward(self, x): self._check_data_format() sync_batch_norm_out, _, _, _, _, _ = _C_ops.sparse_sync_batch_norm_( x, - self.weight, - self.bias, self._mean, self._variance, + self.weight, + self.bias, + not self.training, self._momentum, self._epsilon, self._data_format, - not self.training, - False, False, False, ) From 246fb841c586d8369257fae71204c145bbf7db1d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 2 Nov 2022 01:26:47 -0500 Subject: [PATCH 85/91] Add storage properties into DenseTensor for supporting extra device properties (#47527) * add storage properties for npu * fix compile failed * fix api name mismatch * polish design --- paddle/phi/core/dense_tensor.cc | 30 +++++++ paddle/phi/core/dense_tensor.h | 47 +++++++++++ paddle/phi/core/dense_tensor_impl.cc | 2 + paddle/phi/core/storage_properties.h | 97 ++++++++++++++++++++++ paddle/phi/core/utils/type_registry.h | 1 + paddle/phi/tests/core/test_dense_tensor.cc | 52 ++++++++++++ 6 files changed, 229 insertions(+) create mode 100644 paddle/phi/core/storage_properties.h diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index 02f0fbb895215..8a2d0e8a46bd4 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -53,6 +53,8 @@ DenseTensor::DenseTensor(const std::shared_ptr& holder, DenseTensor::DenseTensor(const DenseTensor& other) : meta_(other.meta()) { holder_ = other.holder_; + storage_properties_ = + std::move(CopyStorageProperties(other.storage_properties_)); inplace_version_counter_ = other.inplace_version_counter_; #ifdef PADDLE_WITH_MKLDNN @@ -64,6 +66,8 @@ DenseTensor::DenseTensor(const DenseTensor& other) : meta_(other.meta()) { DenseTensor& DenseTensor::operator=(const 
DenseTensor& other) { meta_ = other.meta(); holder_ = other.holder_; + storage_properties_ = + std::move(CopyStorageProperties(other.storage_properties_)); inplace_version_counter_ = other.inplace_version_counter_; #ifdef PADDLE_WITH_MKLDNN format_ = other.format_; @@ -75,6 +79,7 @@ DenseTensor& DenseTensor::operator=(const DenseTensor& other) { DenseTensor& DenseTensor::operator=(DenseTensor&& other) { meta_ = std::move(other.meta_); std::swap(holder_, other.holder_); + storage_properties_ = std::move(other.storage_properties_); std::swap(inplace_version_counter_, other.inplace_version_counter_); #ifdef PADDLE_WITH_MKLDNN format_ = other.format_; @@ -241,4 +246,29 @@ DATA_MEMBER_FUNC_INSTANTIATION(::phi::dtype::complex); #undef DATA_MEMBER_FUNC_INSTANTIATION +template +const DeviceT& DenseTensor::storage_properties() const { + PADDLE_ENFORCE_NOT_NULL( + storage_properties_, + phi::errors::PreconditionNotMet( + "The storage_properties of current DenseTensor is nullptr.")); + if (DeviceT::classof(storage_properties_.get())) { + return static_cast(*storage_properties_); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "The actual type of storage_properties is inconsistent with the type " + "of the template parameter passed in.")); + } +} + +template const NPUStorageProperties& DenseTensor::storage_properties() const; +#ifdef PADDLE_WITH_MKLDNN +template const OneDNNStorageProperties& DenseTensor::storage_properties() const; +#endif + +void DenseTensor::set_storage_properties( + std::unique_ptr&& storage_properties) { + storage_properties_ = std::move(storage_properties); +} + } // namespace phi diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index abf242acdb22a..e0d620ac3a53e 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/storage_properties.h" #include "paddle/phi/core/stream.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" @@ -163,6 +164,16 @@ class DenseTensor : public TensorBase, void* data(); + /// \brief Returns the storage_properties of the tensor. + /// \return The storage_properties of the tensor. + template + const DeviceT& storage_properties() const; + + /// \brief Sets the storage_properties of the tensor. + /// \param storage_properties The storage_properties of the tensor. + void set_storage_properties( + std::unique_ptr&& storage_properties); + private: friend class DenseTensorUtils; @@ -170,6 +181,42 @@ class DenseTensor : public TensorBase, DenseTensorMeta meta_; std::shared_ptr holder_; + /** [ Why need StorageProperties? ] + * + * 1. Some hardware or third-party libraries add some additional storage + * properties on top of the description of the basic DenseTensor, such as + * memory desc of MKLDNN, storage_format and storage_layout of NPU, + * these members are necessary for optimal performance, but if the properties + * of each device are added to the DenseTensor with different macro isolation, + * the memory layout of the DenseTensor will become more fragmented. + * Under different compilation conditions, the member layout of the + * DenseTensor is very unstable, which may introduce bugs that are difficult + * to debug. + * + * 2. If the layout of DenseTensor is very different from the framework + * itself, it is recommended to directly inherit TensorBase to implement + * SpatialTensor. 
+ * + * TODO(chenweihang): merge the dnnl::memory::desc and + * dnnl::memory::format_tag into StorageProperties, dnnl::memory::desc is a + * type that takes up a lot of space, original tensor members' size: + * + * DenseTensor size: 880 + * -------- ordered members --------: + * DenseTensorMeta size: 128 + * - is_scalar_ size: 1 + * - DDim size: 80 + * - DataType size: 4 + * - DataLayout size: 4 + * - LoD size: 24 + * - offset size: 8 + * std::shared_ptr size: 16 + * std::shared_ptr size: 16 // need to be moved + * dnnl::memory::format_tag size: 4 // need to be moved + * dnnl::memory::desc size: 696 // need to be moved + */ + std::unique_ptr storage_properties_{nullptr}; + public: /* Temporarily put InplaceVersion inside DenseTensor. Will move to AutogradMeta as soon as we switch to Eager Dygraph. diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 36c6c6e96ff90..b78a9dc135e48 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -370,6 +370,8 @@ DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) { meta_.dtype = src.meta_.dtype; meta_.layout = src.meta_.layout; meta_.offset = src.meta_.offset; + storage_properties_ = + std::move(CopyStorageProperties(src.storage_properties_)); #ifdef PADDLE_WITH_MKLDNN format_ = src.format_; mem_desc_ = src.mem_desc_; diff --git a/paddle/phi/core/storage_properties.h b/paddle/phi/core/storage_properties.h new file mode 100644 index 0000000000000..908abd8d9d35d --- /dev/null +++ b/paddle/phi/core/storage_properties.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/phi/core/utils/type_registry.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "dnnl.hpp" // NOLINT +#endif + +namespace phi { + +struct StorageProperties { + public: + virtual ~StorageProperties() = default; + TypeInfo type_info() const { return type_info_; } + + private: + template + friend class TypeInfoTraits; + TypeInfo type_info_{ + TypeInfo::kUnknownType}; +}; + +struct NPUStorageProperties + : public StorageProperties, + public TypeInfoTraits { + virtual ~NPUStorageProperties() = default; + static const char* name() { return "NPUStorageProperties"; } + + int64_t storage_format; + int64_t storage_layout; +}; + +// Add OneDNNStorageProperties firstly for unittest covergae +#ifdef PADDLE_WITH_MKLDNN +struct OneDNNStorageProperties + : public StorageProperties, + public TypeInfoTraits { + virtual ~OneDNNStorageProperties() = default; + static const char* name() { return "OneDNNStorageProperties"; } + + /** + * @brief the detail format of memory block which have layout as ONEDNN + * + * @note ONEDNN lib support various memory format like nchw, nhwc, nChw8C, + * nChw16c, etc. For a ONEDNN memory block, layout will be set as + * DataLayout::ONEDNN meanwhile detail memory format will be kept in + * this field. 
+ */ + dnnl::memory::format_tag format = dnnl::memory::format_tag::undef; + + /// \brief memory descriptor of tensor which have layout set as ONEDNN + dnnl::memory::desc mem_desc; +}; +#endif + +static std::unique_ptr CopyStorageProperties( + const std::unique_ptr& sp) { + if (sp) { + if (NPUStorageProperties::classof(sp.get())) { + auto result = std::make_unique(); + result->storage_format = + static_cast(sp.get())->storage_format; + result->storage_layout = + static_cast(sp.get())->storage_layout; + return result; +#ifdef PADDLE_WITH_MKLDNN + } else if (OneDNNStorageProperties::classof(sp.get())) { + auto result = std::make_unique(); + result->format = static_cast(sp.get())->format; + result->mem_desc = + static_cast(sp.get())->mem_desc; + return result; +#endif + } else { + return nullptr; + } + } + return nullptr; +} + +} // namespace phi diff --git a/paddle/phi/core/utils/type_registry.h b/paddle/phi/core/utils/type_registry.h index c233e1f743b21..ed1c9216e99e9 100644 --- a/paddle/phi/core/utils/type_registry.h +++ b/paddle/phi/core/utils/type_registry.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include "paddle/phi/core/utils/type_info.h" diff --git a/paddle/phi/tests/core/test_dense_tensor.cc b/paddle/phi/tests/core/test_dense_tensor.cc index f6a3e3fa41348..b997d8f1e76ae 100644 --- a/paddle/phi/tests/core/test_dense_tensor.cc +++ b/paddle/phi/tests/core/test_dense_tensor.cc @@ -129,5 +129,57 @@ TEST(dense_tensor, shallow_copy) { CHECK(tensor_0.meta() == tensor_1.meta()); } +struct TestStorageProperties + : public StorageProperties, + public TypeInfoTraits { + virtual ~TestStorageProperties() = default; + static const char* name() { return "TestStorageProperties"; } +}; + +TEST(dense_tensor, storage_properties) { + const DataType dtype{DataType::FLOAT32}; + const DDim dims({1, 2}); + DenseTensorMeta meta(dtype, dims); + + auto fancy_allocator = std::unique_ptr(new FancyAllocator); + DenseTensor tensor(fancy_allocator.get(), meta); + + // test no storage properties + bool caught_exception = false; + try { + tensor.storage_properties(); + } catch (phi::enforce::EnforceNotMet& error) { + caught_exception = true; + } + EXPECT_TRUE(caught_exception); + + // test custom device storage properties + auto npu_properties = std::make_unique(); + npu_properties->storage_format = 1; + npu_properties->storage_layout = 2; + tensor.set_storage_properties(std::move(npu_properties)); + auto get_npu_properties = tensor.storage_properties(); + CHECK_EQ(get_npu_properties.storage_format, 1); + CHECK_EQ(get_npu_properties.storage_layout, 2); + + // test error type storage properties +#ifdef PADDLE_WITH_MKLDNN + caught_exception = false; + try { + tensor.storage_properties(); + } catch (phi::enforce::EnforceNotMet& error) { + caught_exception = true; + } + EXPECT_TRUE(caught_exception); +#endif + + // test copy storage properties + auto cp_tensor = tensor; + auto get_cp_npu_properties = + cp_tensor.storage_properties(); + CHECK_EQ(get_cp_npu_properties.storage_format, 1); + CHECK_EQ(get_cp_npu_properties.storage_layout, 2); +} + } // namespace tests } // namespace phi From 2d058cce52edbe72c855aabd90fb286da0890baf Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 2 Nov 2022 01:28:23 -0500 Subject: [PATCH 86/91] Add phi core file into ci checking list (#47564) * add phi core file into ci list, test=document_fix * remove repated file, test=document_fix --- tools/check_file_diff_approvals.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git 
a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index d2419e7722092..1ebe0ebb71f8d 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -69,6 +69,18 @@ API_FILES=("CMakeLists.txt" "paddle/fluid/eager/autograd_meta.h" "paddle/fluid/eager/backward.cc" "paddle/fluid/eager/backward.h" + "paddle/phi/api/include/tensor.h" + "paddle/phi/core/tensor_base.h" + "paddle/phi/core/dense_tensor.h" + "paddle/phi/core/meta_tensor.h" + "paddle/phi/core/tensor_meta.h" + "paddle/phi/core/attribute.h" + "paddle/phi/core/device_context.h" + "paddle/phi/core/kernel_utils.h" + "paddle/phi/core/kernel_registry.h" + "paddle/phi/core/kernel_factory.h" + "paddle/phi/core/kernel_context.h" + "paddle/phi/core/infermeta_utils.h" ) approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` @@ -191,6 +203,9 @@ for API_FILE in ${API_FILES[*]}; do elif [ "${API_FILE}" == "python/paddle/fluid/dygraph/layers.py" ] || [ "${API_FILE}" == "paddle/fluid/eager/grad_node_info.h" ] || [ "${API_FILE}" == "paddle/fluid/eager/grad_node_info.cc" ] || [ "${API_FILE}" == "paddle/fluid/eager/grad_tensor_holder.h" ] || [ "${API_FILE}" == "paddle/fluid/eager/grad_tensor_holder.cc" ] || [ "${API_FILE}" == "paddle/fluid/eager/tensor_wrapper.h" ] || [ "${API_FILE}" == "paddle/fluid/eager/autograd_meta.cc"] || [ "${API_FILE}" == "paddle/fluid/eager/autograd_meta.h"] || [ "${API_FILE}" == "paddle/fluid/eager/backward.cc"] || [ "${API_FILE}" == "paddle/fluid/eager/backward.h"]; then echo_line="You must have one RD (JiabinYang,chenwhql,phlrain) approval for ${API_FILE}, which manages the underlying code for PaddlePaddle.\n" check_approval JiabinYang chenwhql phlrain + elif [ "${API_FILE}" == "paddle/phi/api/include/tensor.h" ] || [ "${API_FILE}" == "paddle/phi/core/tensor_base.h" ] || [ "${API_FILE}" == "paddle/phi/core/dense_tensor.h" ] || [ "${API_FILE}" == "paddle/phi/core/meta_tensor.h" ] || [ "${API_FILE}" == "paddle/phi/core/tensor_meta.h" ] || [ "${API_FILE}" == "paddle/phi/core/attribute.h" ] || [ "${API_FILE}" == "paddle/phi/core/device_context.h" ] || [ "${API_FILE}" == "paddle/phi/core/kernel_utils.h" ] || [ "${API_FILE}" == "paddle/phi/core/kernel_registry.h" ] || [ "${API_FILE}" == "paddle/phi/core/kernel_factory.h" ] || [ "${API_FILE}" == "paddle/phi/core/kernel_context.h" ] || [ "${API_FILE}" == "paddle/phi/core/infermeta_utils.h" ]; then + echo_line="You must have one RD (chenwhql, phlrain, zyfncg, YuanRisheng) approval for changing ${API_FILE} , which manages the underlying code for PaddlePaddle PHI Library.\n" + check_approval chenwhql phlrain zyfncg YuanRisheng else echo_line="You must have one RD (XiaoguangHu01,chenwhql,zhiqiu,Xreki,luotao1,qili93) approval for ${API_FILE}, which manages the underlying code for fluid.\n" check_approval 1 46782768 12538138 6836917 22561442 6888866 16605440 From 6f7a80c34ea3f5fb8e0c063c195cbc04bdcc531d Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Wed, 2 Nov 2022 14:58:47 +0800 Subject: [PATCH 87/91] fix amax/amin/max/min write overflow (#47570) --- paddle/phi/kernels/funcs/reduce_functor.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index 34032e153c049..e0e7ec3d403f1 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -177,7 +177,8 @@ struct MaxOrMinGradFunctor { auto 
zeros = dx->constant(0); // If there are multiple minimum or maximum elements, the subgradient of // each is the set [0, 1], and we pass gradient to all of them here. - dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); + dx->device(place) = dy->broadcast(dim).reshape(x->dimensions()) * + equals.select(ones, zeros); } }; @@ -259,7 +260,8 @@ struct AMaxOrAMinGradFunctor { auto equal_number = mask.sum() .reshape(Eigen::array({1})) .broadcast(Eigen::array({size})); - dx->device(place) = dy->broadcast(dim) * mask / equal_number; + dx->device(place) = + dy->broadcast(dim).reshape(x->dimensions()) * mask / equal_number; return; } From 5ed487bfc22c3c88a42f6f7690bf0ddca76ec6e7 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Wed, 2 Nov 2022 15:26:09 +0800 Subject: [PATCH 88/91] Dispatch computation OPs before communication in standalone executor (#47471) * Dispath computation OPs before communication in standalone executor * Update code * Fix CI errors --- .../interpreter/interpreter_util.cc | 7 +++- .../interpreter/interpreter_util.h | 2 + .../framework/new_executor/interpretercore.cc | 37 +++++++++++++------ .../framework/new_executor/interpretercore.h | 2 +- .../new_executor/new_executor_defs.cc | 8 +++- .../new_executor/new_executor_defs.h | 8 +++- 6 files changed, 47 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 816331e3fa549..6c002d06b5b19 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -129,14 +129,13 @@ void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type, } } -bool IsCommunicationOp(const Instruction& instr) { +bool IsCommunicationOp(const std::string& op_name) { const std::set special_comm_op_set = { "send", "recv", "send_v2", "recv_v2", }; - const std::string& op_name = instr.OpBase()->Type(); const std::string communication_op_prefix = "c_"; if (op_name.find(communication_op_prefix) != std::string::npos || special_comm_op_set.count(op_name)) { @@ -145,6 +144,10 @@ bool IsCommunicationOp(const Instruction& instr) { return false; } +bool IsCommunicationOp(const Instruction& instr) { + return IsCommunicationOp(instr.OpBase()->Type()); +} + bool IsCpuOp(const Instruction& instr) { return platform::is_cpu_place(instr.DeviceContext().GetPlace()); } diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h index b842d3acfde6d..d6652d2654160 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h @@ -65,6 +65,8 @@ class AsyncWorkQueue { std::unique_ptr queue_group_; }; +bool IsCommunicationOp(const std::string& op_name); + bool IsCommunicationOp(const Instruction& instr); bool IsCpuOp(const Instruction& instr); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 230e333458dd4..825c4e14c4489 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -528,7 +528,12 @@ void InterpreterCore::Convert( for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { auto& op_func_node = nodes[op_idx]; auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); - 
vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); + Priority priority = + interpreter::IsCommunicationOp(op_func_node.operator_base_->Type()) + ? Priority::kLowest + : Priority::kNormal; + vec_instruction_.emplace_back( + op_idx, std::move(op_func_node), *dev_ctx_, priority); } BuildOperatorDependences(); @@ -835,7 +840,7 @@ void InterpreterCore::ExecuteInstructionList( } void InterpreterCore::RunNextInstructions( - const Instruction& instr, std::queue* reserved_next_ops) { + const Instruction& instr, std::deque* reserved_next_ops) { platform::RecordEvent record( "RunNextInstructions", platform::TracerEventType::UserDefined, 10); auto& next_instr = instr.NextInstructions(); @@ -848,7 +853,7 @@ void InterpreterCore::RunNextInstructions( if (instr.KernelType() == OpFuncType::kQueueAsync) { // move all sync_ops into other threads - for (auto next_id : next_instr.SyncRunIds()) { + for (size_t next_id : next_instr.SyncRunIds()) { if (IsReady(next_id)) { async_work_queue_->AddTask( vec_instruction_[next_id].KernelType(), @@ -856,14 +861,22 @@ void InterpreterCore::RunNextInstructions( } } // keep all async_ops running in current thread - for (auto next_id : next_instr.DirectRunIds()) { + for (size_t next_id : next_instr.DirectRunIds()) { if (IsReady(next_id)) { - reserved_next_ops->push(next_id); + if (vec_instruction_[next_id].GetPriority() == Priority::kLowest) { + reserved_next_ops->push_back(next_id); + } else { + reserved_next_ops->push_front(next_id); + } } } - for (auto next_id : next_instr.EventRunIds()) { + for (size_t next_id : next_instr.EventRunIds()) { if (IsReady(next_id)) { - reserved_next_ops->push(next_id); + if (vec_instruction_[next_id].GetPriority() == Priority::kLowest) { + reserved_next_ops->push_back(next_id); + } else { + reserved_next_ops->push_front(next_id); + } } } } else { @@ -895,16 +908,18 @@ void InterpreterCore::RunNextInstructions( [this, next_id] { RunInstructionAsync(next_id); }); } } - if (first_op != -1) reserved_next_ops->push(first_op); + if (first_op != -1) { + reserved_next_ops->push_front(first_op); + } } } void InterpreterCore::RunInstructionAsync(size_t instr_id) { - std::queue ready_ops; - ready_ops.push(instr_id); + std::deque ready_ops; + ready_ops.push_back(instr_id); while (!ready_ops.empty()) { instr_id = ready_ops.front(); - ready_ops.pop(); + ready_ops.pop_front(); auto& instr_node = vec_instruction_.at(instr_id); VLOG(5) << __func__ << " OP id:" << instr_node.Id() << " name:" << instr_node.OpBase()->Type() << " type:" diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index ff89f5ed731de..4cf5053448703 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -92,7 +92,7 @@ class InterpreterCore { void RunInstructionAsync(size_t instr_id); void RunInstruction(const Instruction& instr_node); void RunNextInstructions(const Instruction& instr_id, - std::queue* reserved_next_ops); + std::deque* reserved_next_ops); // only used when program contains no feed op void Prepare(const std::vector& feed_names, const std::vector& feed_tensors, diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 02be9f47ecf3e..08a4a486173f7 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -673,8 +673,12 @@ void 
From daf98c15451f70caae5626baa43d006348bc6410 Mon Sep 17 00:00:00 2001
From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com>
Date: Wed, 2 Nov 2022 16:08:26 +0800
Subject: [PATCH 89/91] rename fw_bw func name of interleave pp (#47571)

---
 .../distributed/fleet/meta_parallel/pipeline_parallel.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
index b7d1eb39c0174..cbdee2c875b9d 100755
--- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
+++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
@@ -534,7 +534,7 @@ def _backward_step_helper(self, micro_step):
 
         return input_tensor_grad
 
-    def interleave_pipeline(
+    def forward_backward_pipeline(
         self, data, scaler, forward_only=False, compute_loss=True
     ):
         # use interleave scheduling strategy.
@@ -763,7 +763,7 @@ def interleave_pipeline( def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): data = self._prepare_training(data, optimizer, lr_scheduler) # interleave scheduler for pipeline parallel - train_loss = self.interleave_pipeline(data, scaler) + train_loss = self.forward_backward_pipeline(data, scaler) # optimizer with paddle.amp.auto_cast(enable=False): @@ -778,4 +778,4 @@ def eval_batch(self, data, compute_loss=False): self._layers.eval() self._compute_loss = compute_loss - return self.interleave_pipeline(data, None, forward_only=True) + return self.forward_backward_pipeline(data, None, forward_only=True) From b045fdfb150b0e18b547801d1f10f62da9438946 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=81=E4=B8=80?= Date: Wed, 2 Nov 2022 16:52:46 +0800 Subject: [PATCH 90/91] Logsigmoid and Tanhshrink ops convert to trt (#47322) --- paddle/fluid/framework/ir/is_test_pass.cc | 25 ++-- .../fluid/inference/api/analysis_predictor.cc | 3 + .../inference/tensorrt/convert/CMakeLists.txt | 3 + .../inference/tensorrt/convert/celu_op.cc | 102 +++++++++++++ .../tensorrt/convert/logsigmoid_op.cc | 78 ++++++++++ .../tensorrt/convert/tanhshrink_op.cc | 79 ++++++++++ .../tensorrt/convert/test_celu_op.cc | 48 ++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 27 ++-- .../ir/inference/test_trt_convert_celu.py | 140 ++++++++++++++++++ .../inference/test_trt_convert_logsigmoid.py | 137 +++++++++++++++++ .../ir/inference/test_trt_convert_silu.py | 53 ++++--- .../inference/test_trt_convert_tanhshrink.py | 137 +++++++++++++++++ 12 files changed, 783 insertions(+), 49 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/celu_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/logsigmoid_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/tanhshrink_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_celu_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_celu.py create mode 100755 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_logsigmoid.py create mode 100755 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tanhshrink.py diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 47a3e46d076c3..38137d18db6bf 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -25,18 +25,19 @@ class Graph; void IsTestPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it " "for activations and pooling."; - auto op_list = {"pool2d", "sigmoid", "logsigmoid", - "softshrink", "exp", "brelu", - "pow", "leaky_relu", "stanh", - "relu", "tanh", "tanh_shrink", - "sqrt", "abs", "ceil", - "elu", "floor", "cos", - "sin", "round", "reciprocal", - "hard_shrink", "hard_sigmoid", "relu6", - "soft_relu", "swish", "thresholded_relu", - "log", "square", "softplus", - "softsign", "silu", "mish", - "gumbel_softmax"}; + auto op_list = {"pool2d", "sigmoid", "logsigmoid", + "softshrink", "exp", "brelu", + "pow", "leaky_relu", "stanh", + "relu", "tanh", "tanh_shrink", + "sqrt", "abs", "ceil", + "elu", "floor", "cos", + "sin", "round", "reciprocal", + "hard_shrink", "hard_sigmoid", "relu6", + "soft_relu", "swish", "thresholded_relu", + "log", "square", "softplus", + "softsign", "silu", "gumbel_softmax", + "mish", "celu", "tanhshrink", + "logsigmoid"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); 
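None of the three activations handled by this patch maps to a single TensorRT layer, so the converters added below assemble them from Activation, Unary, and ElementWise primitives. For reference, a minimal NumPy sketch of the math those TRT graphs reproduce (illustrative only, not part of the patch):

import numpy as np

def celu(x, alpha=1.0):
    # relu(x) + min(0, alpha * (exp(x / alpha) - 1)), the decomposition
    # CeluOpConverter builds from kRELU, kEXP, Sub/Prod/Min, and kSUM.
    return np.maximum(0.0, x) + np.minimum(0.0, alpha * (np.exp(x / alpha) - 1.0))

def tanh_shrink(x):
    # x - tanh(x): an ElementWise kSUB over a kTANH activation.
    return x - np.tanh(x)

def logsigmoid(x):
    # log(sigmoid(x)): a kLOG unary over a kSIGMOID activation.
    return np.log(1.0 / (1.0 + np.exp(-x)))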
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 280427cb4c8f3..345e8e64011cd 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2318,11 +2318,14 @@ USE_TRT_CONVERTER(sum)
 USE_TRT_CONVERTER(shape)
 USE_TRT_CONVERTER(fill_constant)
 USE_TRT_CONVERTER(fused_token_prune)
+USE_TRT_CONVERTER(celu)
 USE_TRT_CONVERTER(layernorm_shift_partition)
 USE_TRT_CONVERTER(preln_layernorm_shift_partition)
 USE_TRT_CONVERTER(merge_layernorm)
 USE_TRT_CONVERTER(generic_plugin_creater)
 USE_TRT_CONVERTER(custom_plugin_creater)
+USE_TRT_CONVERTER(tanh_shrink)
+USE_TRT_CONVERTER(logsigmoid)
 USE_TRT_CONVERTER(lookup_table)
 USE_TRT_CONVERTER(expand_v2)
 #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000)
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index a6c0a42de37ab..72e2e1c7f0c95 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -76,7 +76,10 @@ list(
     shape_op.cc
     fill_constant_op.cc
     fused_token_prune_op.cc
+    celu_op.cc
     layernorm_shift_partition_op.cc
+    tanhshrink_op.cc
+    logsigmoid_op.cc
     preln_layernorm_shift_partition_op.cc
     merge_layernorm_op.cc
     generic_and_custom_plugin_creater.cc
diff --git a/paddle/fluid/inference/tensorrt/convert/celu_op.cc b/paddle/fluid/inference/tensorrt/convert/celu_op.cc
new file mode 100644
index 0000000000000..c357b95a2b0c8
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/celu_op.cc
@@ -0,0 +1,102 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace nvinfer1 {
+class ILayer;
+}  // namespace nvinfer1
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class CeluOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope,
+                  bool test_mode) override {
+    VLOG(4) << "convert fluid celu op to tensorrt layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    int input_num = op_desc.Input("X").size();
+    PADDLE_ENFORCE_EQ(input_num,
+                      1,
+                      platform::errors::InvalidArgument(
+                          "The input X's size must equal to 1 in TRT celu op."
+                          " But received X's size %d.",
+                          input_num));
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    // Get output
+    size_t output_num = op_desc.Output("Out").size();
+    PADDLE_ENFORCE_EQ(
+        output_num,
+        1UL,
+        platform::errors::InvalidArgument(
+            "The output Out's size must equal to 1 in TRT celu op. "
+            "But received Out's size %u.",
+            output_num));
+    // Get attrs
+    float alpha = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha"));
+
+    nvinfer1::ILayer* layer = nullptr;
+
+    int32_t rank = input->getDimensions().nbDims;
+    nvinfer1::Dims constant_shape;
+    constant_shape.nbDims = rank;
+    std::fill(constant_shape.d, constant_shape.d + rank, 1);
+    std::vector<float> weight_alpha_data{alpha};
+    std::vector<float> weight_zero_data{0.f};
+    std::vector<float> weight_one_data{1.f};
+    auto* alpha_data =
+        AddConstantLayer(weight_alpha_data.data(), constant_shape);
+    auto* constant_zero_data =
+        AddConstantLayer(weight_zero_data.data(), constant_shape);
+    auto* constant_one_data =
+        AddConstantLayer(weight_one_data.data(), constant_shape);
+
+    auto* input_div_with_alpha = Div(input, alpha_data);
+    auto* input_exp = TRT_ENGINE_ADD_LAYER(
+        engine_, Unary, *input_div_with_alpha, nvinfer1::UnaryOperation::kEXP);
+    auto* input_sub_with_one = Sub(input_exp->getOutput(0), constant_one_data);
+    auto* input_prod_with_alpha = Prod(input_sub_with_one, alpha_data);
+    auto* min_input = Min(input_prod_with_alpha, constant_zero_data);
+    auto* relu = TRT_ENGINE_ADD_LAYER(
+        engine_, Activation, *input, nvinfer1::ActivationType::kRELU);
+    layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                 ElementWise,
+                                 *relu->getOutput(0),
+                                 *min_input,
+                                 nvinfer1::ElementWiseOperation::kSUM);
+
+    auto output_name = op_desc.Output("Out")[0];
+    RreplenishLayerAndOutput(layer, "celu", {output_name}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(celu, CeluOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/logsigmoid_op.cc b/paddle/fluid/inference/tensorrt/convert/logsigmoid_op.cc
new file mode 100644
index 0000000000000..74f8d21eab702
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/logsigmoid_op.cc
@@ -0,0 +1,78 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace nvinfer1 {
+class ILayer;
+}  // namespace nvinfer1
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class LogSigmoidOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope,
+                  bool test_mode) override {
+    VLOG(4) << "convert fluid LogSigmoid op to tensorrt layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    int input_num = op_desc.Input("X").size();
+    PADDLE_ENFORCE_EQ(
+        input_num,
+        1,
+        platform::errors::InvalidArgument(
+            "The input X's size must equal to 1 in TRT LogSigmoid op."
+            " But received X's size %d.",
+            input_num));
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    // Get output
+    size_t output_num = op_desc.Output("Out").size();
+    PADDLE_ENFORCE_EQ(
+        output_num,
+        1UL,
+        platform::errors::InvalidArgument(
+            "The output Out's size must equal to 1 in TRT LogSigmoid op. "
+            "But received Out's size %u.",
+            output_num));
+
+    nvinfer1::ILayer* layer = nullptr;
+    auto* sigmoid = TRT_ENGINE_ADD_LAYER(
+        engine_, Activation, *input, nvinfer1::ActivationType::kSIGMOID);
+    layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                 Unary,
+                                 *(sigmoid->getOutput(0)),
+                                 nvinfer1::UnaryOperation::kLOG);
+    auto output_name = op_desc.Output("Out")[0];
+    RreplenishLayerAndOutput(layer, "logsigmoid", {output_name}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(logsigmoid, LogSigmoidOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/tanhshrink_op.cc b/paddle/fluid/inference/tensorrt/convert/tanhshrink_op.cc
new file mode 100644
index 0000000000000..f31b2a50655e7
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/tanhshrink_op.cc
@@ -0,0 +1,79 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace nvinfer1 {
+class ILayer;
+}  // namespace nvinfer1
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class TanhshrinkOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope,
+                  bool test_mode) override {
+    VLOG(4) << "convert fluid Tanhshrink op to tensorrt layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    int input_num = op_desc.Input("X").size();
+    PADDLE_ENFORCE_EQ(
+        input_num,
+        1,
+        platform::errors::InvalidArgument(
+            "The input X's size must equal to 1 in TRT Tanhshrink op."
+            " But received X's size %d.",
+            input_num));
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    // Get output
+    size_t output_num = op_desc.Output("Out").size();
+    PADDLE_ENFORCE_EQ(
+        output_num,
+        1UL,
+        platform::errors::InvalidArgument(
+            "The output Out's size must equal to 1 in TRT Tanhshrink op. "
+            "But received Out's size %u.",
+            output_num));
+
+    nvinfer1::ILayer* layer = nullptr;
+    auto* tanh = TRT_ENGINE_ADD_LAYER(
+        engine_, Activation, *input, nvinfer1::ActivationType::kTANH);
+    layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                 ElementWise,
+                                 *input,
+                                 *(tanh->getOutput(0)),
+                                 nvinfer1::ElementWiseOperation::kSUB);
+    auto output_name = op_desc.Output("Out")[0];
+    RreplenishLayerAndOutput(layer, "tanh_shrink", {output_name}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(tanh_shrink, TanhshrinkOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_celu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_celu_op.cc
new file mode 100644
index 0000000000000..73b89ebb75807
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_celu_op.cc
@@ -0,0 +1,48 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(celu_op, test_celu) {
+  std::unordered_set<std::string> parameters;
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("celu_input", nvinfer1::Dims3(3, 2, 2));
+  validator.DeclOutputVar("celu_out", nvinfer1::Dims3(3, 2, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("celu");
+  desc.SetInput("X", {"celu_input"});
+  desc.SetOutput("Out", {"celu_out"});
+
+  desc.SetAttr("alpha", 2.0f);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(1);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(celu);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 3e6f5779c6fa8..6680b16a356b9 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -77,16 +77,17 @@ struct SimpleOpTypeSetTeller : public Teller {
         desc.HasAttr("skip_quant"))
       return false;
     std::unordered_set<std::string> act_op_list = {
-        "relu",       "relu6",     "sigmoid",
-        "elu",        "selu",      "softsign",
-        "softplus",   "stanh",     "thresholded_relu",
-        "exp",        "log",       "sqrt",
-        "abs",        "sin",       "cos",
-        "tan",        "tanh",      "sinh",
-        "cosh",       "asin",      "acos",
-        "atan",       "asinh",     "atanh",
-        "ceil",       "floor",     "erf",
-        "silu"};
+        "relu",     "relu6",    "sigmoid",
+        "elu",      "selu",     "softsign",
+        "softplus", "stanh",    "thresholded_relu",
+        "exp",      "log",      "sqrt",
+        "abs",      "sin",      "cos",
+        "tan",      "tanh",     "sinh",
+        "cosh",     "asin",     "acos",
+        "atan",     "asinh",    "atanh",
+        "ceil",     "floor",    "erf",
+        "silu",     "celu",     "tanh_shrink",
+        "logsigmoid"};
     if (act_op_list.find(op_type) != act_op_list.end()) {
       auto* block = desc.Block();
       if (block == nullptr) {
@@ -2212,6 +2213,7 @@ struct SimpleOpTypeSetTeller : public Teller {
           "shuffle_channel",
           "swish",
           "silu",
+          "celu",
           "split",
           "instance_norm",
           "gelu",
@@ -2268,6 +2270,8
@@ struct SimpleOpTypeSetTeller : public Teller { "squeeze2", "unsqueeze2", "layernorm_shift_partition", + "tanh_shrink", + "logsigmoid", "preln_layernorm_shift_partition", "lookup_table", "lookup_table_v2", @@ -2330,6 +2334,7 @@ struct SimpleOpTypeSetTeller : public Teller { "shuffle_channel", "swish", "silu", + "celu", "split", "instance_norm", "gelu", @@ -2387,6 +2392,8 @@ struct SimpleOpTypeSetTeller : public Teller { "unsqueeze2", "fused_token_prune", "layernorm_shift_partition", + "tanh_shrink", + "logsigmoid", "preln_layernorm_shift_partition", "merge_layernorm", "lookup_table", diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_celu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_celu.py new file mode 100644 index 0000000000000..58a8fecfdeb96 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_celu.py @@ -0,0 +1,140 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List, Dict, Any +import unittest + + +class TrtConvertCeluTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(dims, attrs: List[Dict[str, Any]]): + if dims == 1: + return np.ones([3]).astype(np.float32) + elif dims == 2: + return np.ones([3, 64]).astype(np.float32) + elif dims == 3: + return np.ones([3, 64, 64]).astype(np.float32) + else: + return np.ones([1, 3, 64, 64]).astype(np.float32) + + for dims in [1, 2, 3, 4]: + for alpha in [1.0, 2.0, 3.0]: + self.dims = dims + + dics = [{"alpha": alpha}] + + ops_config = [ + { + "op_type": "celu", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1, dims, dics) + ) + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [128]} + self.dynamic_shape.opt_input_shape = {"input_data": [64]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 64]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 32]} + self.dynamic_shape.max_input_shape = { + "input_data": [10, 64, 64] + } + 
self.dynamic_shape.opt_input_shape = {"input_data": [3, 64, 64]} + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 64, 64] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), (1e-3, 1e-3) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), (1e-3, 1e-3) + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_logsigmoid.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_logsigmoid.py new file mode 100755 index 0000000000000..f9ba462e669cd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_logsigmoid.py @@ -0,0 +1,137 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List, Dict, Any +import unittest + + +class TrtConvertLogSigmoidTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(dims, attrs: List[Dict[str, Any]]): + if dims == 1: + return np.ones([3]).astype(np.float32) + elif dims == 2: + return np.ones([3, 64]).astype(np.float32) + elif dims == 3: + return np.ones([3, 64, 64]).astype(np.float32) + else: + return np.ones([1, 3, 64, 64]).astype(np.float32) + + for dims in [1, 2, 3, 4]: + self.dims = dims + + ops_config = [ + { + "op_type": "logsigmoid", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {}, + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1, dims, {}) + ) + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [128]} + self.dynamic_shape.opt_input_shape = {"input_data": [64]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 64]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 32]} + self.dynamic_shape.max_input_shape = { + "input_data": [10, 64, 64] + } + self.dynamic_shape.opt_input_shape = {"input_data": [3, 64, 64]} + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 64, 64] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), (1e-3, 1e-3) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), (1e-3, 1e-3) + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_silu.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_silu.py index 73df326fb01ad..9ed3720a79f48 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_silu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_silu.py @@ -21,7 +21,7 @@ import unittest -class TrtConvertSwishTest(TrtLayerAutoScanTest): +class TrtConvertSiluTest(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -37,33 +37,32 @@ def generate_input1(dims, attrs: List[Dict[str, Any]]): return np.ones([1, 3, 64, 64]).astype(np.float32) for dims in [1, 2, 3, 4]: - for beta in [1.0, 2.0, 3.0]: - self.dims = dims - - ops_config = [ - { - "op_type": "silu", - "op_inputs": { - "X": ["input_data"], - }, - "op_outputs": {"Out": ["output_data"]}, - "op_attrs": {}, - } - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": TensorConfig( - data_gen=partial(generate_input1, dims, {}) - ) - }, - outputs=["output_data"], - ) + self.dims = dims - yield program_config + ops_config = [ + { + "op_type": "silu", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {}, + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1, dims, {}) + ) + }, + outputs=["output_data"], + ) + + yield program_config def sample_predictor_configs( self, program_config diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tanhshrink.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tanhshrink.py new file mode 100755 index 0000000000000..9fbdbb89f2398 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tanhshrink.py @@ -0,0 +1,137 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List, Dict, Any +import unittest + + +class TrtConvertTanhshrinkTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(dims, attrs: List[Dict[str, Any]]): + if dims == 1: + return np.ones([3]).astype(np.float32) + elif dims == 2: + return np.ones([3, 64]).astype(np.float32) + elif dims == 3: + return np.ones([3, 64, 64]).astype(np.float32) + else: + return np.ones([1, 3, 64, 64]).astype(np.float32) + + for dims in [1, 2, 3, 4]: + self.dims = dims + + ops_config = [ + { + "op_type": "tanh_shrink", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {}, + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1, dims, {}) + ) + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [128]} + self.dynamic_shape.opt_input_shape = {"input_data": [64]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 64]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 32]} + self.dynamic_shape.max_input_shape = { + "input_data": [10, 64, 64] + } + self.dynamic_shape.opt_input_shape = {"input_data": [3, 64, 64]} + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 64, 64] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), (1e-3, 1e-3) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), (1e-3, 1e-3) + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() From c79ae02b3bcf553d073759083a93063e7d88b8fc Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Wed, 2 Nov 2022 17:06:23 +0800 Subject: 
[PATCH 91/91] add cuda117 dockerfile (#47412)

* add cuda117 dockerfile; test=cuda117

* notest;test=cuda117

* test=cuda117

* test=document_fix
---
 tools/dockerfile/ci_dockerfile.sh | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh
index fbc21ec955d58..b793f31004d92 100644
--- a/tools/dockerfile/ci_dockerfile.sh
+++ b/tools/dockerfile/ci_dockerfile.sh
@@ -145,12 +145,33 @@ function make_ce_framework_dockcerfile(){
   sed -i 's#python setup.py install#python3.7 setup.py install#g' ${dockerfile_name}
 }
 
+function make_unbuntu18_cu117_dockerfile(){
+  dockerfile_name="Dockerfile.cuda117_cudnn8_gcc82_ubuntu18_coverage"
+  sed "s#<baseimg>#nvidia/cuda:11.7.0-cudnn8-devel-ubuntu18.04#g" ./Dockerfile.ubuntu18 >${dockerfile_name}
+  sed -i "s#<setcuda>#ENV LD_LIBRARY_PATH=/usr/local/cuda-11.7/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH #g" ${dockerfile_name}
+  sed -i 's###g' ${dockerfile_name}
+  sed -i "7i ENV TZ=Asia/Beijing" ${dockerfile_name}
+  sed -i "8i RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone" ${dockerfile_name}
+  sed -i "27i RUN apt-get update && apt-get install -y liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev libsndfile1" ${dockerfile_name}
+  dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}')
+  sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \
+    tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name}
+  sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libcurl4-openssl-dev gettext pigz zstd \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \
+    tar -xvf git-2.17.1.tar.gz \&\& \
+    cd git-2.17.1 \&\& \
+    ./configure --with-openssl --with-curl --prefix=/usr/local \&\& \
+    make -j8 \&\& make install " ${dockerfile_name}
+  sed -i "${dockerfile_line}i RUN pip install wheel \&\& pip3 install PyGithub wheel distro \&\& pip3.7 install PyGithub \&\& pip3.8 install distro" ${dockerfile_name}
+  sed -i 's# && rm /etc/apt/sources.list.d/nvidia-ml.list##g' ${dockerfile_name}
+}
+
 function main() {
   make_ubuntu_dockerfile
   make_ubuntu_trt7_dockerfile
   make_centos_dockerfile
   make_cinn_dockerfile
   make_ce_framework_dockcerfile
+  make_unbuntu18_cu117_dockerfile
 }
 
 main "$@"
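The new make_unbuntu18_cu117_dockerfile function derives the CUDA 11.7 CI image by rewriting marker placeholders in the Dockerfile.ubuntu18 template and then splicing extra RUN steps in with sed. A rough Python sketch of the same template-substitution idea (the <baseimg> marker name is an assumption about the template, and this is not a drop-in replacement for the shell function):

from pathlib import Path

def render_dockerfile(template: str, base_image: str, out_path: str) -> None:
    # Replace the assumed base-image marker, as the first sed command does.
    text = Path(template).read_text()
    text = text.replace("<baseimg>", base_image)
    Path(out_path).write_text(text)

render_dockerfile(
    "Dockerfile.ubuntu18",
    "nvidia/cuda:11.7.0-cudnn8-devel-ubuntu18.04",
    "Dockerfile.cuda117_cudnn8_gcc82_ubuntu18_coverage",
)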