Skip to content

Commit 8845e0f

Browse files
authored
[None][fix] fix ci (#6814)
1 parent ab0d768 commit 8845e0f

File tree

4 files changed

+81
-113
lines changed

4 files changed

+81
-113
lines changed

cpp/tensorrt_llm/nanobind/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,6 @@ if(NOT WIN32)
5454
${TRTLLM_NB_MODULE}
5555
PROPERTIES
5656
LINK_FLAGS
57-
"-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
57+
"-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
5858
)
5959
endif()

cpp/tensorrt_llm/pybind/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,6 @@ if(NOT WIN32)
5555
${TRTLLM_PYBIND_MODULE}
5656
PROPERTIES
5757
LINK_FLAGS
58-
"-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
58+
"-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
5959
)
6060
endif()

docker/Dockerfile.multi

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,8 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
7171
ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
7272

7373
# Install OpenCV with FFMPEG support
74-
RUN pip3 uninstall -y opencv && \
75-
rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \
76-
pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
74+
RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
75+
RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
7776

7877
# WARs against security issues inherited from pytorch:25.06
7978
# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7

scripts/build_wheel.py

Lines changed: 77 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
from shutil import copy, copytree, rmtree
2828
from subprocess import DEVNULL, CalledProcessError, check_output, run
2929
from textwrap import dedent
30-
from typing import Sequence
30+
from typing import List
3131

3232
try:
3333
from packaging.requirements import Requirement
@@ -120,8 +120,7 @@ def create_venv(project_dir: Path):
120120
return venv_prefix
121121

122122

123-
def setup_venv(project_dir: Path, requirements_file: Path,
124-
no_venv: bool) -> tuple[Path, Path]:
123+
def setup_venv(project_dir: Path, requirements_file: Path, no_venv: bool):
125124
"""Creates/updates a venv and installs requirements.
126125
127126
Args:
@@ -280,111 +279,14 @@ def generate_fmha_cu(project_dir, venv_python):
280279
os.chdir(project_dir)
281280

282281

283-
def create_cuda_stub_links(cuda_stub_dir: str):
284-
"""
285-
Creates symbolic links for CUDA stub libraries in the provided directory.
286-
287-
Args:
288-
cuda_stub_dir (str): Path to the directory containing CUDA stubs.
289-
"""
290-
cuda_stub_path = Path(cuda_stub_dir)
291-
if not cuda_stub_path.exists():
292-
raise RuntimeError(
293-
f"CUDA stub directory '{cuda_stub_dir}' does not exist.")
294-
295-
shared_objects = ["cuda.so",
296-
"nvidia-ml.so"] # List of shared object names to process.
297-
298-
for lib_name in shared_objects:
299-
# Define the full paths for the library (.so) and its versioned link (.so.1).
300-
so = cuda_stub_path / f"lib{lib_name}" # e.g., libcuda.so
301-
so_versioned = cuda_stub_path / f"lib{lib_name}.1" # e.g., libcuda.so.1
302-
303-
# Check if the library exists and the versioned link does not.
304-
if so.exists() and not so_versioned.exists():
305-
try:
306-
# Attempt to create the symbolic link.
307-
so_versioned.symlink_to(so)
308-
except PermissionError:
309-
# Handle permission errors by attempting to use `sudo` to create the link.
310-
try:
311-
build_run(f"sudo ln -s {str(so)} {str(so_versioned)}")
312-
except CalledProcessError as sudo_error:
313-
print(
314-
f"Failed to create symbolic link even with sudo: {sudo_error}"
315-
)
316-
317-
318-
def generate_python_stubs_linux(binding_type: str, venv_python: Path,
319-
deep_ep: bool):
320-
is_nanobind = binding_type == "nanobind"
321-
package = "nanobind" if is_nanobind else "pybind11-stubgen"
322-
build_run(f"\"{venv_python}\" -m pip install {package}")
323-
324-
env_stub_gen = os.environ.copy()
325-
cuda_home_dir = env_stub_gen.get("CUDA_HOME") or env_stub_gen.get(
326-
"CUDA_PATH") or "/usr/local/cuda"
327-
cuda_stub_dir = f"{cuda_home_dir}/lib64/stubs"
328-
ld_library_path = env_stub_gen.get("LD_LIBRARY_PATH")
329-
if Path(cuda_stub_dir).exists():
330-
# Create symbolic links for the CUDA stubs
331-
create_cuda_stub_links(cuda_stub_dir)
332-
env_stub_gen[
333-
"LD_LIBRARY_PATH"] = f"{ld_library_path}:{cuda_stub_dir}" if ld_library_path else cuda_stub_dir
334-
if is_nanobind:
335-
build_run(f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
336-
env=env_stub_gen)
337-
else:
338-
build_run(
339-
f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
340-
env=env_stub_gen)
341-
build_run(
342-
f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
343-
env=env_stub_gen)
344-
if deep_ep:
345-
build_run(
346-
f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
347-
env=env_stub_gen)
348-
349-
350-
def generate_python_stubs_windows(binding_type: str, venv_python: Path,
351-
pkg_dir: Path, lib_dir: Path):
352-
if binding_type == "nanobind":
353-
print("Windows not yet supported for nanobind stubs")
354-
exit(1)
355-
else:
356-
build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
357-
stubgen = "stubgen.py"
358-
stubgen_contents = """
359-
# Loading torch, trt before bindings is required to avoid import errors on windows.
360-
# isort: off
361-
import torch
362-
import tensorrt as trt
363-
# isort: on
364-
import os
365-
import platform
366-
367-
from pybind11_stubgen import main
368-
369-
if __name__ == "__main__":
370-
# Load dlls from `libs` directory before launching bindings.
371-
if platform.system() == "Windows":
372-
os.add_dll_directory(r\"{lib_dir}\")
373-
main()
374-
""".format(lib_dir=lib_dir)
375-
(pkg_dir / stubgen).write_text(dedent(stubgen_contents))
376-
build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
377-
(pkg_dir / stubgen).unlink()
378-
379-
380282
def main(*,
381283
build_type: str = "Release",
382284
generator: str = "",
383285
build_dir: Path = None,
384286
dist_dir: Path = None,
385287
cuda_architectures: str = None,
386288
job_count: int = None,
387-
extra_cmake_vars: Sequence[str] = tuple(),
289+
extra_cmake_vars: List[str] = list(),
388290
extra_make_targets: str = "",
389291
trt_root: str = '/usr/local/tensorrt',
390292
nccl_root: str = None,
@@ -459,7 +361,7 @@ def main(*,
459361

460362
if on_windows:
461363
# Windows does not support multi-device currently.
462-
extra_cmake_vars += ["ENABLE_MULTI_DEVICE=0"]
364+
extra_cmake_vars.extend(["ENABLE_MULTI_DEVICE=0"])
463365

464366
# The Ninja CMake generator is used for our Windows build
465367
# (Easier than MSBuild to make compatible with our Docker image)
@@ -801,14 +703,81 @@ def get_binding_lib(subdirectory, name):
801703
dirs_exist_ok=True)
802704

803705
if not skip_stubs:
706+
with working_directory(project_dir):
707+
if binding_type == "nanobind":
708+
build_run(f"\"{venv_python}\" -m pip install nanobind")
709+
else:
710+
build_run(
711+
f"\"{venv_python}\" -m pip install pybind11-stubgen")
804712
with working_directory(pkg_dir):
805713
if on_windows:
806-
generate_python_stubs_windows(binding_type, venv_python,
807-
pkg_dir, lib_dir)
808-
else: # on linux
809-
generate_python_stubs_linux(
810-
binding_type, venv_python,
811-
bool(deep_ep_cuda_architectures))
714+
if binding_type == "nanobind":
715+
print("Windows not yet supported for nanobind stubs")
716+
exit(1)
717+
else:
718+
stubgen = "stubgen.py"
719+
stubgen_contents = """
720+
# Loading torch, trt before bindings is required to avoid import errors on windows.
721+
# isort: off
722+
import torch
723+
import tensorrt as trt
724+
# isort: on
725+
import os
726+
import platform
727+
728+
from pybind11_stubgen import main
729+
730+
if __name__ == "__main__":
731+
# Load dlls from `libs` directory before launching bindings.
732+
if platform.system() == "Windows":
733+
os.add_dll_directory(r\"{lib_dir}\")
734+
main()
735+
""".format(lib_dir=lib_dir)
736+
(pkg_dir / stubgen).write_text(dedent(stubgen_contents))
737+
build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
738+
(pkg_dir / stubgen).unlink()
739+
else:
740+
env_ld = os.environ.copy()
741+
742+
new_library_path = "/usr/local/cuda/compat:/usr/local/cuda/compat/lib:/usr/local/cuda/compat/lib.real"
743+
if 'LD_LIBRARY_PATH' in env_ld:
744+
new_library_path += f":{env_ld['LD_LIBRARY_PATH']}"
745+
746+
result = build_run("find /usr -name *libnvidia-ml.so*",
747+
capture_output=True,
748+
text=True)
749+
assert result.returncode == 0, f"Failed to run find *libnvidia-ml.so*: {result.stderr}"
750+
751+
# Build containers only contain stub version of libnvidia-ml.so and not the real version.
752+
# If real version not in system, we need to create symbolic link to stub version to prevent import errors.
753+
if "libnvidia-ml.so.1" not in result.stdout:
754+
if "libnvidia-ml.so" in result.stdout:
755+
line = result.stdout.splitlines()[0]
756+
path = os.path.dirname(line)
757+
new_library_path += f":{path}"
758+
build_run(f"ln -s {line} {path}/libnvidia-ml.so.1")
759+
else:
760+
print(
761+
f"Failed to find libnvidia-ml.so: {result.stderr}",
762+
file=sys.stderr)
763+
exit(1)
764+
765+
env_ld["LD_LIBRARY_PATH"] = new_library_path
766+
if binding_type == "nanobind":
767+
build_run(
768+
f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
769+
env=env_ld)
770+
else:
771+
build_run(
772+
f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
773+
env=env_ld)
774+
if deep_ep_cuda_architectures:
775+
build_run(
776+
f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
777+
env=env_ld)
778+
build_run(
779+
f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
780+
env=env_ld)
812781

813782
if not skip_building_wheel:
814783
if dist_dir is None:

0 commit comments

Comments
 (0)