Patch summary (free-text preamble placed before the first "diff --git" header):
  * rename the inference v2 .cu kernel sources to *_cuda.cu and update the op builders' source lists
  * pass absolute include directories to every extension builder
  * add an nvcc --threads=N flag; N comes from DS_NVCC_THREADS, else min(cpu count, 8)
  * let DS_DISABLE_NINJA decide whether pre-installed ops are built with ninja
Review changes relative to the submitted patch:
  * os.cpu_count() may return None, so fall back to 1 before taking min(..., 8)
  * the setup.py comment "force disable ninja" no longer matched the code and was refreshed
NOTE(review): leading whitespace inside the hunks was reconstructed; confirm against the target tree.

diff --git a/deepspeed/inference/v2/kernels/core_ops/bias_activations/bias_activation.cu b/deepspeed/inference/v2/kernels/core_ops/bias_activations/bias_activation_cuda.cu
similarity index 100%
rename from deepspeed/inference/v2/kernels/core_ops/bias_activations/bias_activation.cu
rename to deepspeed/inference/v2/kernels/core_ops/bias_activations/bias_activation_cuda.cu
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm.cu b/deepspeed/inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm_cuda.cu
similarity index 100%
rename from deepspeed/inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm.cu
rename to deepspeed/inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm_cuda.cu
diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm.cu b/deepspeed/inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm_cuda.cu
similarity index 100%
rename from deepspeed/inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm.cu
rename to deepspeed/inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm_cuda.cu
diff --git a/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cu b/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu
similarity index 100%
rename from deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cu
rename to deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/embed/embed.cu b/deepspeed/inference/v2/kernels/ragged_ops/embed/embed_cuda.cu
similarity index 100%
rename from deepspeed/inference/v2/kernels/ragged_ops/embed/embed.cu
rename to deepspeed/inference/v2/kernels/ragged_ops/embed/embed_cuda.cu
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cu b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu
similarity index 100%
rename from deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cu
rename to deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cu b/deepspeed/inference/v2/kernels/ragged_ops/logits_gather/logits_gather_cuda.cu
similarity index 100%
rename from deepspeed/inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cu
rename to deepspeed/inference/v2/kernels/ragged_ops/logits_gather/logits_gather_cuda.cu
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cu b/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather_cuda.cu
similarity index 100%
rename from deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cu
rename to deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather_cuda.cu
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cu b/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter_cuda.cu
similarity index 100%
rename from deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cu
rename to deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter_cuda.cu
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cu b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating_cuda.cu
similarity index 100%
rename from deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cu
rename to deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating_cuda.cu
diff --git a/op_builder/builder.py b/op_builder/builder.py
index fec39f2b4feb..dd77f967cc60 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -453,9 +453,10 @@ def deepspeed_src_path(self, code_path):
     def builder(self):
         from torch.utils.cpp_extension import CppExtension
 
+        include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
         return CppExtension(name=self.absolute_name(),
                             sources=self.strip_empty_entries(self.sources()),
-                            include_dirs=self.strip_empty_entries(self.include_paths()),
+                            include_dirs=include_dirs,
                             extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
                             extra_link_args=self.strip_empty_entries(self.extra_ldflags()))
 
@@ -638,7 +639,7 @@ def builder(self):
             from torch.utils.cpp_extension import CppExtension as ExtensionBuilder
         else:
             from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder
-
+        include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
         compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \
                        {'cxx': self.strip_empty_entries(self.cxx_args()), \
                            'nvcc': self.strip_empty_entries(self.nvcc_args())}
@@ -651,7 +652,7 @@ def builder(self):
 
         cuda_ext = ExtensionBuilder(name=self.absolute_name(),
                                     sources=self.strip_empty_entries(self.sources()),
-                                    include_dirs=self.strip_empty_entries(self.include_paths()),
+                                    include_dirs=include_dirs,
                                     libraries=self.strip_empty_entries(self.libraries_args()),
                                     extra_compile_args=compile_args,
                                     extra_link_args=self.strip_empty_entries(self.extra_ldflags()))
@@ -702,11 +703,18 @@ def nvcc_args(self):
                 '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
             ]
         else:
+            try:
+                nvcc_threads = int(os.getenv("DS_NVCC_THREADS", ""))
+                if nvcc_threads <= 0:
+                    raise ValueError("")
+            except ValueError:
+                nvcc_threads = min(os.cpu_count() or 1, 8)  # os.cpu_count() may return None
+
             cuda_major, _ = installed_cuda_version()
             args += [
                 '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math',
                 '-std=c++17' if cuda_major > 10 else '-std=c++14', '-U__CUDA_NO_HALF_OPERATORS__',
-                '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__'
+                '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__', f'--threads={nvcc_threads}'
             ]
             if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1':
                 args.append('--ptxas-options=-v')
diff --git a/op_builder/cpu/builder.py b/op_builder/cpu/builder.py
index f6a71c7d1971..d2bc8eacfa25 100644
--- a/op_builder/cpu/builder.py
+++ b/op_builder/cpu/builder.py
@@ -3,6 +3,8 @@
 
 # DeepSpeed Team
 
+import os
+
 try:
     # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
     # if successful this also means we're doing a local install and not JIT compile path
@@ -16,12 +18,12 @@ class CPUOpBuilder(OpBuilder):
 
     def builder(self):
         from torch.utils.cpp_extension import CppExtension as ExtensionBuilder
-
+        include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
         compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())}
 
         cpp_ext = ExtensionBuilder(name=self.absolute_name(),
                                    sources=self.strip_empty_entries(self.sources()),
-                                   include_dirs=self.strip_empty_entries(self.include_paths()),
+                                   include_dirs=include_dirs,
                                    libraries=self.strip_empty_entries(self.libraries_args()),
                                    extra_compile_args=compile_args)
 
diff --git a/op_builder/hpu/builder.py b/op_builder/hpu/builder.py
index 5a538c84040c..3c86128fffd6 100644
--- a/op_builder/hpu/builder.py
+++ b/op_builder/hpu/builder.py
@@ -4,6 +4,8 @@
 
 # DeepSpeed Team
 
+import os
+
 try:
     # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
     # if successful this also means we're doing a local install and not JIT compile path
@@ -17,12 +19,12 @@ class CPUOpBuilder(OpBuilder):
 
     def builder(self):
         from torch.utils.cpp_extension import CppExtension as ExtensionBuilder
-
+        include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
         compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())}
 
         cpp_ext = ExtensionBuilder(name=self.absolute_name(),
                                    sources=self.strip_empty_entries(self.sources()),
-                                   include_dirs=self.strip_empty_entries(self.include_paths()),
+                                   include_dirs=include_dirs,
                                    libraries=self.strip_empty_entries(self.libraries_args()),
                                    extra_compile_args=compile_args)
 
diff --git a/op_builder/inference_core_ops.py b/op_builder/inference_core_ops.py
index 229b500bebda..8073b63ad16b 100755
--- a/op_builder/inference_core_ops.py
+++ b/op_builder/inference_core_ops.py
@@ -60,13 +60,13 @@ def sources(self):
         sources = [
             "inference/v2/kernels/core_ops/core_ops.cpp",
             "inference/v2/kernels/core_ops/bias_activations/bias_activation.cpp",
-            "inference/v2/kernels/core_ops/bias_activations/bias_activation.cu",
+            "inference/v2/kernels/core_ops/bias_activations/bias_activation_cuda.cu",
             "inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm.cpp",
-            "inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm.cu",
+            "inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm_cuda.cu",
             "inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm.cpp",
-            "inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm.cu",
+            "inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm_cuda.cu",
             "inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cpp",
-            "inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cu",
+            "inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu",
         ]
 
         prefix = self.get_prefix()
diff --git a/op_builder/ragged_ops.py b/op_builder/ragged_ops.py
index 8cb372e96c37..ec7cab91885f 100644
--- a/op_builder/ragged_ops.py
+++ b/op_builder/ragged_ops.py
@@ -63,18 +63,18 @@ def sources(self):
             "inference/v2/kernels/ragged_ops/atom_builder/atom_builder.cpp",
             "inference/v2/kernels/ragged_ops/blocked_flash/blocked_flash.cpp",
             "inference/v2/kernels/ragged_ops/embed/embed.cpp",
-            "inference/v2/kernels/ragged_ops/embed/embed.cu",
+            "inference/v2/kernels/ragged_ops/embed/embed_cuda.cu",
             "inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cpp",
-            "inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cu",
+            "inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu",
             "inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cpp",
-            "inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cu",
+            "inference/v2/kernels/ragged_ops/logits_gather/logits_gather_cuda.cu",
             "inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cpp",
-            "inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cu",
+            "inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter_cuda.cu",
             "inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cpp",
-            "inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cu",
+            "inference/v2/kernels/ragged_ops/moe_gather/moe_gather_cuda.cu",
             "inference/v2/kernels/ragged_ops/ragged_helpers/ragged_kernel_helpers.cpp",
             "inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cpp",
-            "inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cu",
+            "inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating_cuda.cu",
         ]
 
         prefix = self.get_prefix()
diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py
index 2f0feba1d4e1..459dcce6bfae 100644
--- a/op_builder/xpu/builder.py
+++ b/op_builder/xpu/builder.py
@@ -23,11 +23,11 @@ def builder(self):
             from intel_extension_for_pytorch.xpu.cpp_extension import DPCPPExtension
         except ImportError:
             from intel_extension_for_pytorch.xpu.utils import DPCPPExtension
-
+        include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())]
         print("dpcpp sources = {}".format(self.sources()))
         dpcpp_ext = DPCPPExtension(name=self.absolute_name(),
                                    sources=self.strip_empty_entries(self.sources()),
-                                   include_dirs=self.strip_empty_entries(self.include_paths()),
+                                   include_dirs=include_dirs,
                                    extra_compile_args={
                                        'cxx': self.strip_empty_entries(self.cxx_args()),
                                    },
diff --git a/setup.py b/setup.py
index d9aed9b47bd8..418c1f11a0e0 100755
--- a/setup.py
+++ b/setup.py
@@ -119,7 +119,8 @@ def get_env_if_set(key, default: typing.Any = ""):
-# For any pre-installed ops force disable ninja.
+# Pre-installed ops are built with ninja unless DS_DISABLE_NINJA is set.
 if torch_available:
     from accelerator import get_accelerator
-    cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=False)
+    use_ninja = not is_env_set("DS_DISABLE_NINJA")
+    cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=use_ninja)
 
 if torch_available:
     TORCH_MAJOR = torch.__version__.split('.')[0]