Skip to content

Commit

Permalink
Cherrypick NV fixes to release/2.4 (#48263)
Browse files Browse the repository at this point in the history
* Reduce squeeze2_matmul_fuse_pass, flatten tests time (#47098)

* Add missing fp32 config and reduce the testing combination

* Reduce trt matmul pass test max examples

* Loose TRT fp16 tests tolerance (#47100)

* Loose TRT half test tolerance to 1e-3 (#47101)

* Loose TRT half test tolerance to 1e-3 (#47106)

* Update distributed_strategy.proto (#46531)

* Close popen pipe after used (#47053)

* Add launch_bounds (#47285)

* Fix TRT UT failures (#47488)

* Format cherry-picked commits

* CudnnNormConvolution is no longer supported on NVIDIA Hopper GPUs (#48203)

* Skip tests that use fused_ops on H100

* Add error message to FusedOps on H100

Co-authored-by: Shijie <[email protected]>
Co-authored-by: Leo Chen <[email protected]>
Co-authored-by: Tian Zheng <[email protected]>
  • Loading branch information
4 people authored Nov 28, 2022
1 parent a2f61fe commit 7a0b862
Show file tree
Hide file tree
Showing 53 changed files with 3,129 additions and 2,744 deletions.
1 change: 1 addition & 0 deletions paddle/fluid/framework/distributed_strategy.proto
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ message BuildStrategy {
optional bool allow_cuda_graph_capture = 14 [ default = false ];
optional int32 reduce_strategy = 15 [ default = 0 ];
optional bool fuse_gemm_epilogue = 16 [ default = false ];
optional string debug_graphviz_path = 17;
}

message ExecutionStrategy {
Expand Down
8 changes: 8 additions & 0 deletions paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,14 @@ struct NormConvolutionArgs {
int stride,
int dilation,
int group) {
PADDLE_ENFORCE_LT(
ctx.GetComputeCapability(),
90,
phi::errors::PreconditionNotMet(
"Expect compute compatiblity to be less than 90, but got %d. "
"CUDNN FusedOps is no longer available on H100 and later "
"devices.",
ctx.GetComputeCapability()));
PADDLE_ENFORCE_EQ(
input_shape.size(),
4U,
Expand Down
8 changes: 4 additions & 4 deletions paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ TEST(CudnnNormConvFp16, K1S1) {
phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

if (ctx->GetComputeCapability() < 70) {
if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3, true),
Expand Down Expand Up @@ -472,7 +472,7 @@ TEST(CudnnNormConvFp16, K3S1) {
phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

if (ctx->GetComputeCapability() < 70) {
if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3, true),
Expand Down Expand Up @@ -502,7 +502,7 @@ TEST(CudnnNormConvFp16, K1S1O4) {
phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

if (ctx->GetComputeCapability() < 70) {
if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3, true),
Expand Down Expand Up @@ -532,7 +532,7 @@ TEST(CudnnNormConvFp16, K1S2O4) {
phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

if (ctx->GetComputeCapability() <= 70) {
if (ctx->GetComputeCapability() <= 70 || ctx->GetComputeCapability() >= 90) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3), paddle::platform::EnforceNotMet);
Expand Down
24 changes: 13 additions & 11 deletions paddle/fluid/operators/fused/fused_dropout_act_bias.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,17 +256,19 @@ template <typename T,
int BlockSizeX,
int BlockSizeY,
int VecSize,
typename Functor>
__global__ void FusedDropoutActBiasGrad(Functor act_grad,
const T *dout,
const MaskType *mask,
const T *src,
const T *bias,
const T factor,
const int64_t rows,
const int64_t cols,
T *dx,
T *dbias) {
typename Functor,
int THREADS_PER_CTA = BlockSizeX *BlockSizeY>
__global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad(
Functor act_grad,
const T *dout,
const MaskType *mask,
const T *src,
const T *bias,
const T factor,
const int64_t rows,
const int64_t cols,
T *dx,
T *dbias) {
int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x;

using LoadT = phi::AlignedVector<T, VecSize>;
Expand Down
124 changes: 81 additions & 43 deletions python/paddle/fluid/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@
if os.name == 'nt':
third_lib_path = current_path + os.sep + '..' + os.sep + 'libs'
# Will load shared library from 'path' on windows
os.environ[
'path'] = current_path + ';' + third_lib_path + ';' + os.environ[
'path']
os.environ['path'] = (
current_path + ';' + third_lib_path + ';' + os.environ['path']
)
sys.path.insert(0, third_lib_path)
# Note: from python3.8, PATH will not take effect
# https://github.com/python/cpython/pull/12302
Expand All @@ -47,20 +47,24 @@

except ImportError as e:
from .. import compat as cpt

if os.name == 'nt':
executable_path = os.path.abspath(os.path.dirname(sys.executable))
raise ImportError(
"""NOTE: You may need to run \"set PATH=%s;%%PATH%%\"
if you encounters \"DLL load failed\" errors. If you have python
installed in other directory, replace \"%s\" with your own
directory. The original error is: \n %s""" %
(executable_path, executable_path, cpt.get_exception_message(e)))
directory. The original error is: \n %s"""
% (executable_path, executable_path, cpt.get_exception_message(e))
)
else:
raise ImportError(
"""NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
if you encounters \"libmkldnn.so not found\" errors. If you have python
installed in other directory, replace \"/usr/local/lib\" with your own
directory. The original error is: \n""" + cpt.get_exception_message(e))
directory. The original error is: \n"""
+ cpt.get_exception_message(e)
)
except Exception as e:
raise e

Expand All @@ -70,36 +74,45 @@ def avx_supported():
Whether current system(Linux, MacOS, Windows) is supported with AVX.
"""
from .. import compat as cpt

sysstr = platform.system().lower()
has_avx = False
if sysstr == 'linux':
try:
has_avx = os.popen('cat /proc/cpuinfo | grep -i avx').read() != ''
pipe = os.popen('cat /proc/cpuinfo | grep -i avx')
has_avx = pipe.read() != ''
pipe.close()
except Exception as e:
sys.stderr.write('Can not get the AVX flag from /proc/cpuinfo.\n'
'The original error is: %s\n' %
cpt.get_exception_message(e))
sys.stderr.write(
'Can not get the AVX flag from /proc/cpuinfo.\n'
'The original error is: %s\n' % cpt.get_exception_message(e)
)
return has_avx
elif sysstr == 'darwin':
try:
has_avx = os.popen(
'sysctl machdep.cpu.features | grep -i avx').read() != ''
pipe = os.popen('sysctl machdep.cpu.features | grep -i avx')
has_avx = pipe.read() != ''
pipe.close()
except Exception as e:
sys.stderr.write(
'Can not get the AVX flag from machdep.cpu.features.\n'
'The original error is: %s\n' % cpt.get_exception_message(e))
'The original error is: %s\n' % cpt.get_exception_message(e)
)
if not has_avx:
import subprocess

pipe = subprocess.Popen(
'sysctl machdep.cpu.leaf7_features | grep -i avx',
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stderr=subprocess.PIPE,
)
_ = pipe.communicate()
has_avx = True if pipe.returncode == 0 else False
return has_avx
elif sysstr == 'windows':
import ctypes

ONE_PAGE = ctypes.c_size_t(0x1000)

def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
Expand All @@ -109,24 +122,31 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
pfnVirtualAlloc.restype = ctypes.c_void_p
MEM_COMMIT = ctypes.c_ulong(0x1000)
PAGE_READWRITE = ctypes.c_ulong(0x4)
address = pfnVirtualAlloc(None, ONE_PAGE, MEM_COMMIT,
PAGE_READWRITE)
address = pfnVirtualAlloc(
None, ONE_PAGE, MEM_COMMIT, PAGE_READWRITE
)
if not address:
raise Exception("Failed to VirtualAlloc")

# Copy the code into the memory segment
memmove = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_size_t)(ctypes._memmove_addr)
memmove = ctypes.CFUNCTYPE(
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_size_t,
)(ctypes._memmove_addr)
if memmove(address, code_str, len(code_str)) < 0:
raise Exception("Failed to memmove")

# Enable execute permissions
PAGE_EXECUTE = ctypes.c_ulong(0x10)
pfnVirtualProtect = ctypes.windll.kernel32.VirtualProtect
res = pfnVirtualProtect(ctypes.c_void_p(address),
ONE_PAGE, PAGE_EXECUTE,
ctypes.byref(ctypes.c_ulong(0)))
res = pfnVirtualProtect(
ctypes.c_void_p(address),
ONE_PAGE,
PAGE_EXECUTE,
ctypes.byref(ctypes.c_ulong(0)),
)
if not res:
raise Exception("Failed VirtualProtect")

Expand All @@ -135,7 +155,8 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
pfnGetCurrentProcess.restype = ctypes.c_void_p
prochandle = ctypes.c_void_p(pfnGetCurrentProcess())
res = ctypes.windll.kernel32.FlushInstructionCache(
prochandle, ctypes.c_void_p(address), ONE_PAGE)
prochandle, ctypes.c_void_p(address), ONE_PAGE
)
if not res:
raise Exception("Failed FlushInstructionCache")

Expand All @@ -153,12 +174,14 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
# Convert the code_str into a function that returns uint
func, address = asm_func(code_str)
retval = func()
ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(address),
ctypes.c_size_t(0), ONE_PAGE)
ctypes.windll.kernel32.VirtualFree(
ctypes.c_void_p(address), ctypes.c_size_t(0), ONE_PAGE
)
except Exception as e:
sys.stderr.write('Failed getting the AVX flag on Windows.\n'
'The original error is: %s\n' %
cpt.get_exception_message(e))
sys.stderr.write(
'Failed getting the AVX flag on Windows.\n'
'The original error is: %s\n' % cpt.get_exception_message(e)
)
return (retval & (1 << avx_bit)) > 0
else:
sys.stderr.write('Do not get AVX flag on %s\n' % sysstr)
Expand All @@ -167,10 +190,10 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):

def run_shell_command(cmd):
import subprocess
out, err = subprocess.Popen(cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True).communicate()

out, err = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
).communicate()
if err:
return None
else:
Expand All @@ -179,8 +202,9 @@ def run_shell_command(cmd):

def get_dso_path(core_so, dso_name):
if core_so and dso_name:
return run_shell_command("ldd %s|grep %s|awk '{print $3}'" %
(core_so, dso_name))
return run_shell_command(
"ldd %s|grep %s|awk '{print $3}'" % (core_so, dso_name)
)
else:
return None

Expand All @@ -189,6 +213,7 @@ def load_dso(dso_absolute_path):
if dso_absolute_path:
try:
from ctypes import cdll

cdll.LoadLibrary(dso_absolute_path)
except:
warnings.warn("Load {} failed".format(dso_absolute_path))
Expand Down Expand Up @@ -247,12 +272,14 @@ def to_list(s):

try:
from . import libpaddle

if avx_supported() and not libpaddle.is_compiled_with_avx():
sys.stderr.write(
"Hint: Your machine support AVX, but the installed paddlepaddle doesn't have avx core. "
"Hence, no-avx core with worse preformance will be imported.\nIf you like, you could "
"reinstall paddlepaddle by 'python -m pip install --force-reinstall paddlepaddle-gpu[==version]' "
"to get better performance.\n")
"to get better performance.\n"
)

# assign tensor alias
libpaddle.LoDTensor = libpaddle.Tensor
Expand Down Expand Up @@ -283,6 +310,7 @@ def to_list(s):
from .libpaddle import _Profiler, _ProfilerResult, _RecordEvent
from .libpaddle import _set_current_stream
from .libpaddle import _get_phi_kernel_name

if sys.platform != 'win32':
from .libpaddle import _set_process_pids
from .libpaddle import _erase_process_pids
Expand All @@ -295,12 +323,18 @@ def to_list(s):
except Exception as e:
if has_paddle_dy_lib:
sys.stderr.write(
'Error: Can not import paddle core while this file exists: ' +
current_path + os.sep + 'libpaddle.' + dy_lib_suffix + '\n')
'Error: Can not import paddle core while this file exists: '
+ current_path
+ os.sep
+ 'libpaddle.'
+ dy_lib_suffix
+ '\n'
)
if not avx_supported() and libpaddle.is_compiled_with_avx():
sys.stderr.write(
"Error: Your machine doesn't support AVX, but the installed PaddlePaddle is avx core, "
"you should reinstall paddlepaddle with no-avx core.\n")
"you should reinstall paddlepaddle with no-avx core.\n"
)
raise e


Expand All @@ -317,22 +351,26 @@ def set_paddle_custom_device_lib_path(lib_path):

# set paddle lib path
def set_paddle_lib_path():
site_dirs = site.getsitepackages() if hasattr(
site,
'getsitepackages') else [x for x in sys.path if 'site-packages' in x]
site_dirs = (
site.getsitepackages()
if hasattr(site, 'getsitepackages')
else [x for x in sys.path if 'site-packages' in x]
)
for site_dir in site_dirs:
lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
if os.path.exists(lib_dir):
_set_paddle_lib_path(lib_dir)
set_paddle_custom_device_lib_path(
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins'])
)
return
if hasattr(site, 'USER_SITE'):
lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
if os.path.exists(lib_dir):
_set_paddle_lib_path(lib_dir)
set_paddle_custom_device_lib_path(
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins'])
)


set_paddle_lib_path()
Loading

0 comments on commit 7a0b862

Please sign in to comment.