[None][fix] fix ci (#6814)

QiJune · web-flow · commit 8845e0f0654d · 2025-08-12T02:21:50.000-07:00
diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
@@ -54,6 +54,6 @@ if(NOT WIN32)
     ${TRTLLM_NB_MODULE}
     PROPERTIES
       LINK_FLAGS
-      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
+      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
   )
 endif()
diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt
@@ -55,6 +55,6 @@ if(NOT WIN32)
     ${TRTLLM_PYBIND_MODULE}
     PROPERTIES
       LINK_FLAGS
-      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
+      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
   )
 endif()
diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi
@@ -71,9 +71,8 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
 ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
 
 # Install OpenCV with FFMPEG support
-RUN pip3 uninstall -y opencv && \
-    rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \
-    pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
+RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
+RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
 
 # WARs against security issues inherited from pytorch:25.06
 # * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py
@@ -27,7 +27,7 @@
 from shutil import copy, copytree, rmtree
 from subprocess import DEVNULL, CalledProcessError, check_output, run
 from textwrap import dedent
-from typing import Sequence
+from typing import List
 
 try:
     from packaging.requirements import Requirement
@@ -120,8 +120,7 @@ def create_venv(project_dir: Path):
     return venv_prefix
 
 
-def setup_venv(project_dir: Path, requirements_file: Path,
-               no_venv: bool) -> tuple[Path, Path]:
+def setup_venv(project_dir: Path, requirements_file: Path, no_venv: bool):
     """Creates/updates a venv and installs requirements.
 
     Args:
@@ -280,111 +279,14 @@ def generate_fmha_cu(project_dir, venv_python):
     os.chdir(project_dir)
 
 
-def create_cuda_stub_links(cuda_stub_dir: str):
-    """
-  Creates symbolic links for CUDA stub libraries in the provided directory.
-
-  Args:
-      cuda_stub_dir (str): Path to the directory containing CUDA stubs.
-  """
-    cuda_stub_path = Path(cuda_stub_dir)
-    if not cuda_stub_path.exists():
-        raise RuntimeError(
-            f"CUDA stub directory '{cuda_stub_dir}' does not exist.")
-
-    shared_objects = ["cuda.so",
-                      "nvidia-ml.so"]  # List of shared object names to process.
-
-    for lib_name in shared_objects:
-        # Define the full paths for the library (.so) and its versioned link (.so.1).
-        so = cuda_stub_path / f"lib{lib_name}"  # e.g., libcuda.so
-        so_versioned = cuda_stub_path / f"lib{lib_name}.1"  # e.g., libcuda.so.1
-
-        # Check if the library exists and the versioned link does not.
-        if so.exists() and not so_versioned.exists():
-            try:
-                # Attempt to create the symbolic link.
-                so_versioned.symlink_to(so)
-            except PermissionError:
-                # Handle permission errors by attempting to use `sudo` to create the link.
-                try:
-                    build_run(f"sudo ln -s {str(so)} {str(so_versioned)}")
-                except CalledProcessError as sudo_error:
-                    print(
-                        f"Failed to create symbolic link even with sudo: {sudo_error}"
-                    )
-
-
-def generate_python_stubs_linux(binding_type: str, venv_python: Path,
-                                deep_ep: bool):
-    is_nanobind = binding_type == "nanobind"
-    package = "nanobind" if is_nanobind else "pybind11-stubgen"
-    build_run(f"\"{venv_python}\" -m pip install {package}")
-
-    env_stub_gen = os.environ.copy()
-    cuda_home_dir = env_stub_gen.get("CUDA_HOME") or env_stub_gen.get(
-        "CUDA_PATH") or "/usr/local/cuda"
-    cuda_stub_dir = f"{cuda_home_dir}/lib64/stubs"
-    ld_library_path = env_stub_gen.get("LD_LIBRARY_PATH")
-    if Path(cuda_stub_dir).exists():
-        # Create symbolic links for the CUDA stubs
-        create_cuda_stub_links(cuda_stub_dir)
-        env_stub_gen[
-            "LD_LIBRARY_PATH"] = f"{ld_library_path}:{cuda_stub_dir}" if ld_library_path else cuda_stub_dir
-    if is_nanobind:
-        build_run(f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
-                  env=env_stub_gen)
-    else:
-        build_run(
-            f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
-            env=env_stub_gen)
-        build_run(
-            f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
-            env=env_stub_gen)
-        if deep_ep:
-            build_run(
-                f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
-                env=env_stub_gen)
-
-
-def generate_python_stubs_windows(binding_type: str, venv_python: Path,
-                                  pkg_dir: Path, lib_dir: Path):
-    if binding_type == "nanobind":
-        print("Windows not yet supported for nanobind stubs")
-        exit(1)
-    else:
-        build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
-        stubgen = "stubgen.py"
-        stubgen_contents = """
-                        # Loading torch, trt before bindings is required to avoid import errors on windows.
-                        # isort: off
-                        import torch
-                        import tensorrt as trt
-                        # isort: on
-                        import os
-                        import platform
-
-                        from pybind11_stubgen import main
-
-                        if __name__ == "__main__":
-                            # Load dlls from `libs` directory before launching bindings.
-                            if platform.system() == "Windows":
-                                os.add_dll_directory(r\"{lib_dir}\")
-                            main()
-                        """.format(lib_dir=lib_dir)
-        (pkg_dir / stubgen).write_text(dedent(stubgen_contents))
-        build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
-        (pkg_dir / stubgen).unlink()
-
-
 def main(*,
          build_type: str = "Release",
          generator: str = "",
          build_dir: Path = None,
          dist_dir: Path = None,
          cuda_architectures: str = None,
          job_count: int = None,
-         extra_cmake_vars: Sequence[str] = tuple(),
+         extra_cmake_vars: List[str] = list(),
          extra_make_targets: str = "",
          trt_root: str = '/usr/local/tensorrt',
          nccl_root: str = None,
@@ -459,7 +361,7 @@ def main(*,
 
     if on_windows:
         # Windows does not support multi-device currently.
-        extra_cmake_vars += ["ENABLE_MULTI_DEVICE=0"]
+        extra_cmake_vars.extend(["ENABLE_MULTI_DEVICE=0"])
 
         # The Ninja CMake generator is used for our Windows build
         # (Easier than MSBuild to make compatible with our Docker image)
@@ -801,14 +703,81 @@ def get_binding_lib(subdirectory, name):
                      dirs_exist_ok=True)
 
         if not skip_stubs:
+            with working_directory(project_dir):
+                if binding_type == "nanobind":
+                    build_run(f"\"{venv_python}\" -m pip install nanobind")
+                else:
+                    build_run(
+                        f"\"{venv_python}\" -m pip install pybind11-stubgen")
             with working_directory(pkg_dir):
                 if on_windows:
-                    generate_python_stubs_windows(binding_type, venv_python,
-                                                  pkg_dir, lib_dir)
-                else:  # on linux
-                    generate_python_stubs_linux(
-                        binding_type, venv_python,
-                        bool(deep_ep_cuda_architectures))
+                    if binding_type == "nanobind":
+                        print("Windows not yet supported for nanobind stubs")
+                        exit(1)
+                    else:
+                        stubgen = "stubgen.py"
+                        stubgen_contents = """
+                        # Loading torch, trt before bindings is required to avoid import errors on windows.
+                        # isort: off
+                        import torch
+                        import tensorrt as trt
+                        # isort: on
+                        import os
+                        import platform
+
+                        from pybind11_stubgen import main
+
+                        if __name__ == "__main__":
+                            # Load dlls from `libs` directory before launching bindings.
+                            if platform.system() == "Windows":
+                                os.add_dll_directory(r\"{lib_dir}\")
+                            main()
+                        """.format(lib_dir=lib_dir)
+                        (pkg_dir / stubgen).write_text(dedent(stubgen_contents))
+                        build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
+                        (pkg_dir / stubgen).unlink()
+                else:
+                    env_ld = os.environ.copy()
+
+                    new_library_path = "/usr/local/cuda/compat:/usr/local/cuda/compat/lib:/usr/local/cuda/compat/lib.real"
+                    if 'LD_LIBRARY_PATH' in env_ld:
+                        new_library_path += f":{env_ld['LD_LIBRARY_PATH']}"
+
+                    result = build_run("find /usr -name *libnvidia-ml.so*",
+                                       capture_output=True,
+                                       text=True)
+                    assert result.returncode == 0, f"Failed to run find *libnvidia-ml.so*: {result.stderr}"
+
+                    # Build containers contain only the stub version of libnvidia-ml.so, not the real one.
+                    # If the real version is not on the system, we create a symbolic link to the stub version to prevent import errors.
+                    if "libnvidia-ml.so.1" not in result.stdout:
+                        if "libnvidia-ml.so" in result.stdout:
+                            line = result.stdout.splitlines()[0]
+                            path = os.path.dirname(line)
+                            new_library_path += f":{path}"
+                            build_run(f"ln -s {line} {path}/libnvidia-ml.so.1")
+                        else:
+                            print(
+                                f"Failed to find libnvidia-ml.so: {result.stderr}",
+                                file=sys.stderr)
+                            exit(1)
+
+                    env_ld["LD_LIBRARY_PATH"] = new_library_path
+                    if binding_type == "nanobind":
+                        build_run(
+                            f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
+                            env=env_ld)
+                    else:
+                        build_run(
+                            f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
+                            env=env_ld)
+                        if deep_ep_cuda_architectures:
+                            build_run(
+                                f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
+                                env=env_ld)
+                        build_run(
+                            f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
+                            env=env_ld)
 
     if not skip_building_wheel:
         if dist_dir is None:

Original file line number	Diff line number	Diff line change
`@@ -54,6 +54,6 @@ if(NOT WIN32)`
`54`	`54`	`${TRTLLM_NB_MODULE}`
`55`	`55`	`PROPERTIES`
`56`	`56`	`LINK_FLAGS`
`57`		`- "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"`
	`57`	`+ "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"`
`58`	`58`	`)`
`59`	`59`	`endif()`
Original file line number	Diff line number	Diff line change
`@@ -55,6 +55,6 @@ if(NOT WIN32)`
`55`	`55`	`${TRTLLM_PYBIND_MODULE}`
`56`	`56`	`PROPERTIES`
`57`	`57`	`LINK_FLAGS`
`58`		`- "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"`
	`58`	`+ "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"`
`59`	`59`	`)`
`60`	`60`	`endif()`