Skip to content

Commit

Permalink
Cherrypick NV fixes to release/2.4 (#48263)
Browse files Browse the repository at this point in the history
* Reduce squeeze2_matmul_fuse_pass, flatten tests time (#47098)

* Add missing fp32 config and reduce the testing combination

* Reduce trt matmul pass test max examples

* Loose TRT fp16 tests tolerance (#47100)

* Loose TRT half test tolerance to 1e-3 (#47101)

* Loose TRT half test tolerance to 1e-3 (#47106)

* Update distributed_strategy.proto (#46531)

* Close popen pipe after used (#47053)

* Add launch_bounds (#47285)

* Fix TRT UT failures (#47488)

* Format cherry-picked commits

* CudnnNormConvolution is no longer supported on NVIDIA Hopper GPUs (#48203)

* Skip tests that use fused_ops on H100

* Add error message to FusedOps on H100

Co-authored-by: Shijie <[email protected]>
Co-authored-by: Leo Chen <[email protected]>
Co-authored-by: Tian Zheng <[email protected]>
  • Loading branch information
4 people authored Nov 28, 2022
1 parent a2f61fe commit 7a0b862
Show file tree
Hide file tree
Showing 53 changed files with 3,129 additions and 2,744 deletions.
1 change: 1 addition & 0 deletions paddle/fluid/framework/distributed_strategy.proto
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ message BuildStrategy {
optional bool allow_cuda_graph_capture = 14 [ default = false ];
optional int32 reduce_strategy = 15 [ default = 0 ];
optional bool fuse_gemm_epilogue = 16 [ default = false ];
optional string debug_graphviz_path = 17;
}

message ExecutionStrategy {
Expand Down
8 changes: 8 additions & 0 deletions paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,14 @@ struct NormConvolutionArgs {
int stride,
int dilation,
int group) {
PADDLE_ENFORCE_LT(
ctx.GetComputeCapability(),
90,
phi::errors::PreconditionNotMet(
"Expect compute compatiblity to be less than 90, but got %d. "
"CUDNN FusedOps is no longer available on H100 and later "
"devices.",
ctx.GetComputeCapability()));
PADDLE_ENFORCE_EQ(
input_shape.size(),
4U,
Expand Down
8 changes: 4 additions & 4 deletions paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ TEST(CudnnNormConvFp16, K1S1) {
phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

if (ctx->GetComputeCapability() < 70) {
if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3, true),
Expand Down Expand Up @@ -472,7 +472,7 @@ TEST(CudnnNormConvFp16, K3S1) {
phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

if (ctx->GetComputeCapability() < 70) {
if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3, true),
Expand Down Expand Up @@ -502,7 +502,7 @@ TEST(CudnnNormConvFp16, K1S1O4) {
phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

if (ctx->GetComputeCapability() < 70) {
if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3, true),
Expand Down Expand Up @@ -532,7 +532,7 @@ TEST(CudnnNormConvFp16, K1S2O4) {
phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

if (ctx->GetComputeCapability() <= 70) {
if (ctx->GetComputeCapability() <= 70 || ctx->GetComputeCapability() >= 90) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3), paddle::platform::EnforceNotMet);
Expand Down
24 changes: 13 additions & 11 deletions paddle/fluid/operators/fused/fused_dropout_act_bias.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,17 +256,19 @@ template <typename T,
int BlockSizeX,
int BlockSizeY,
int VecSize,
typename Functor>
__global__ void FusedDropoutActBiasGrad(Functor act_grad,
const T *dout,
const MaskType *mask,
const T *src,
const T *bias,
const T factor,
const int64_t rows,
const int64_t cols,
T *dx,
T *dbias) {
typename Functor,
int THREADS_PER_CTA = BlockSizeX *BlockSizeY>
__global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad(
Functor act_grad,
const T *dout,
const MaskType *mask,
const T *src,
const T *bias,
const T factor,
const int64_t rows,
const int64_t cols,
T *dx,
T *dbias) {
int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x;

using LoadT = phi::AlignedVector<T, VecSize>;
Expand Down
124 changes: 81 additions & 43 deletions python/paddle/fluid/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@
if os.name == 'nt':
third_lib_path = current_path + os.sep + '..' + os.sep + 'libs'
# Will load shared library from 'path' on windows
os.environ[
'path'] = current_path + ';' + third_lib_path + ';' + os.environ[
'path']
os.environ['path'] = (
current_path + ';' + third_lib_path + ';' + os.environ['path']
)
sys.path.insert(0, third_lib_path)
# Note: from python3.8, PATH will not take effect
# https://github.com/python/cpython/pull/12302
Expand All @@ -47,20 +47,24 @@

except ImportError as e:
from .. import compat as cpt

if os.name == 'nt':
executable_path = os.path.abspath(os.path.dirname(sys.executable))
raise ImportError(
"""NOTE: You may need to run \"set PATH=%s;%%PATH%%\"
if you encounters \"DLL load failed\" errors. If you have python
installed in other directory, replace \"%s\" with your own
directory. The original error is: \n %s""" %
(executable_path, executable_path, cpt.get_exception_message(e)))
directory. The original error is: \n %s"""
% (executable_path, executable_path, cpt.get_exception_message(e))
)
else:
raise ImportError(
"""NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
if you encounters \"libmkldnn.so not found\" errors. If you have python
installed in other directory, replace \"/usr/local/lib\" with your own
directory. The original error is: \n""" + cpt.get_exception_message(e))
directory. The original error is: \n"""
+ cpt.get_exception_message(e)
)
except Exception as e:
raise e

Expand All @@ -70,36 +74,45 @@ def avx_supported():
Whether current system(Linux, MacOS, Windows) is supported with AVX.
"""
from .. import compat as cpt

sysstr = platform.system().lower()
has_avx = False
if sysstr == 'linux':
try:
has_avx = os.popen('cat /proc/cpuinfo | grep -i avx').read() != ''
pipe = os.popen('cat /proc/cpuinfo | grep -i avx')
has_avx = pipe.read() != ''
pipe.close()
except Exception as e:
sys.stderr.write('Can not get the AVX flag from /proc/cpuinfo.\n'
'The original error is: %s\n' %
cpt.get_exception_message(e))
sys.stderr.write(
'Can not get the AVX flag from /proc/cpuinfo.\n'
'The original error is: %s\n' % cpt.get_exception_message(e)
)
return has_avx
elif sysstr == 'darwin':
try:
has_avx = os.popen(
'sysctl machdep.cpu.features | grep -i avx').read() != ''
pipe = os.popen('sysctl machdep.cpu.features | grep -i avx')
has_avx = pipe.read() != ''
pipe.close()
except Exception as e:
sys.stderr.write(
'Can not get the AVX flag from machdep.cpu.features.\n'
'The original error is: %s\n' % cpt.get_exception_message(e))
'The original error is: %s\n' % cpt.get_exception_message(e)
)
if not has_avx:
import subprocess

pipe = subprocess.Popen(
'sysctl machdep.cpu.leaf7_features | grep -i avx',
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stderr=subprocess.PIPE,
)
_ = pipe.communicate()
has_avx = True if pipe.returncode == 0 else False
return has_avx
elif sysstr == 'windows':
import ctypes

ONE_PAGE = ctypes.c_size_t(0x1000)

def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
Expand All @@ -109,24 +122,31 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
pfnVirtualAlloc.restype = ctypes.c_void_p
MEM_COMMIT = ctypes.c_ulong(0x1000)
PAGE_READWRITE = ctypes.c_ulong(0x4)
address = pfnVirtualAlloc(None, ONE_PAGE, MEM_COMMIT,
PAGE_READWRITE)
address = pfnVirtualAlloc(
None, ONE_PAGE, MEM_COMMIT, PAGE_READWRITE
)
if not address:
raise Exception("Failed to VirtualAlloc")

# Copy the code into the memory segment
memmove = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_size_t)(ctypes._memmove_addr)
memmove = ctypes.CFUNCTYPE(
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_size_t,
)(ctypes._memmove_addr)
if memmove(address, code_str, len(code_str)) < 0:
raise Exception("Failed to memmove")

# Enable execute permissions
PAGE_EXECUTE = ctypes.c_ulong(0x10)
pfnVirtualProtect = ctypes.windll.kernel32.VirtualProtect
res = pfnVirtualProtect(ctypes.c_void_p(address),
ONE_PAGE, PAGE_EXECUTE,
ctypes.byref(ctypes.c_ulong(0)))
res = pfnVirtualProtect(
ctypes.c_void_p(address),
ONE_PAGE,
PAGE_EXECUTE,
ctypes.byref(ctypes.c_ulong(0)),
)
if not res:
raise Exception("Failed VirtualProtect")

Expand All @@ -135,7 +155,8 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
pfnGetCurrentProcess.restype = ctypes.c_void_p
prochandle = ctypes.c_void_p(pfnGetCurrentProcess())
res = ctypes.windll.kernel32.FlushInstructionCache(
prochandle, ctypes.c_void_p(address), ONE_PAGE)
prochandle, ctypes.c_void_p(address), ONE_PAGE
)
if not res:
raise Exception("Failed FlushInstructionCache")

Expand All @@ -153,12 +174,14 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
# Convert the code_str into a function that returns uint
func, address = asm_func(code_str)
retval = func()
ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(address),
ctypes.c_size_t(0), ONE_PAGE)
ctypes.windll.kernel32.VirtualFree(
ctypes.c_void_p(address), ctypes.c_size_t(0), ONE_PAGE
)
except Exception as e:
sys.stderr.write('Failed getting the AVX flag on Windows.\n'
'The original error is: %s\n' %
cpt.get_exception_message(e))
sys.stderr.write(
'Failed getting the AVX flag on Windows.\n'
'The original error is: %s\n' % cpt.get_exception_message(e)
)
return (retval & (1 << avx_bit)) > 0
else:
sys.stderr.write('Do not get AVX flag on %s\n' % sysstr)
Expand All @@ -167,10 +190,10 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):

def run_shell_command(cmd):
import subprocess
out, err = subprocess.Popen(cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True).communicate()

out, err = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
).communicate()
if err:
return None
else:
Expand All @@ -179,8 +202,9 @@ def run_shell_command(cmd):

def get_dso_path(core_so, dso_name):
if core_so and dso_name:
return run_shell_command("ldd %s|grep %s|awk '{print $3}'" %
(core_so, dso_name))
return run_shell_command(
"ldd %s|grep %s|awk '{print $3}'" % (core_so, dso_name)
)
else:
return None

Expand All @@ -189,6 +213,7 @@ def load_dso(dso_absolute_path):
if dso_absolute_path:
try:
from ctypes import cdll

cdll.LoadLibrary(dso_absolute_path)
except:
warnings.warn("Load {} failed".format(dso_absolute_path))
Expand Down Expand Up @@ -247,12 +272,14 @@ def to_list(s):

try:
from . import libpaddle

if avx_supported() and not libpaddle.is_compiled_with_avx():
sys.stderr.write(
"Hint: Your machine support AVX, but the installed paddlepaddle doesn't have avx core. "
"Hence, no-avx core with worse preformance will be imported.\nIf you like, you could "
"reinstall paddlepaddle by 'python -m pip install --force-reinstall paddlepaddle-gpu[==version]' "
"to get better performance.\n")
"to get better performance.\n"
)

# assign tensor alias
libpaddle.LoDTensor = libpaddle.Tensor
Expand Down Expand Up @@ -283,6 +310,7 @@ def to_list(s):
from .libpaddle import _Profiler, _ProfilerResult, _RecordEvent
from .libpaddle import _set_current_stream
from .libpaddle import _get_phi_kernel_name

if sys.platform != 'win32':
from .libpaddle import _set_process_pids
from .libpaddle import _erase_process_pids
Expand All @@ -295,12 +323,18 @@ def to_list(s):
except Exception as e:
if has_paddle_dy_lib:
sys.stderr.write(
'Error: Can not import paddle core while this file exists: ' +
current_path + os.sep + 'libpaddle.' + dy_lib_suffix + '\n')
'Error: Can not import paddle core while this file exists: '
+ current_path
+ os.sep
+ 'libpaddle.'
+ dy_lib_suffix
+ '\n'
)
if not avx_supported() and libpaddle.is_compiled_with_avx():
sys.stderr.write(
"Error: Your machine doesn't support AVX, but the installed PaddlePaddle is avx core, "
"you should reinstall paddlepaddle with no-avx core.\n")
"you should reinstall paddlepaddle with no-avx core.\n"
)
raise e


Expand All @@ -317,22 +351,26 @@ def set_paddle_custom_device_lib_path(lib_path):

# set paddle lib path
def set_paddle_lib_path():
site_dirs = site.getsitepackages() if hasattr(
site,
'getsitepackages') else [x for x in sys.path if 'site-packages' in x]
site_dirs = (
site.getsitepackages()
if hasattr(site, 'getsitepackages')
else [x for x in sys.path if 'site-packages' in x]
)
for site_dir in site_dirs:
lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
if os.path.exists(lib_dir):
_set_paddle_lib_path(lib_dir)
set_paddle_custom_device_lib_path(
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins'])
)
return
if hasattr(site, 'USER_SITE'):
lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
if os.path.exists(lib_dir):
_set_paddle_lib_path(lib_dir)
set_paddle_custom_device_lib_path(
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins'])
)


set_paddle_lib_path()
Loading

0 comments on commit 7a0b862

Please sign in to comment.