Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -994,9 +994,8 @@ intel = [
]
amd = [
"unsloth[huggingfacenotorch]",
"bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl ; ('linux' in sys_platform) and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-win_amd64.whl ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_aarch64.whl ; ('linux' in sys_platform) and (platform_machine == 'aarch64')",
"bitsandbytes>=0.49.1 ; ('linux' in sys_platform) and (platform_machine == 'AMD64' or platform_machine == 'x86_64' or platform_machine == 'aarch64')",
"bitsandbytes>=0.49.1 ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
]

[project.urls]
Expand Down
37 changes: 16 additions & 21 deletions unsloth/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,21 @@
fix_message_factory_issue,
check_fbgemm_gpu_version,
disable_broken_causal_conv1d,
_filter_rocm_amdgpu_ids_fd2_noise,
configure_amdgpu_asic_id_table_path,
torchvision_compatibility_check,
fix_diffusers_warnings,
fix_huggingface_hub,
)

# Configure libdrm ids table path early so ROCm can resolve AMD GPU names.
configure_amdgpu_asic_id_table_path()
disable_broken_causal_conv1d()
fix_message_factory_issue()
check_fbgemm_gpu_version()
torchvision_compatibility_check()
fix_diffusers_warnings()
fix_huggingface_hub()
del configure_amdgpu_asic_id_table_path
del disable_broken_causal_conv1d
del fix_message_factory_issue
del check_fbgemm_gpu_version
Expand Down Expand Up @@ -96,9 +99,7 @@
# os.system("pip install --upgrade --no-cache-dir --no-deps --user unsloth_zoo")
# except:
# raise ImportError("Unsloth: Please update unsloth_zoo via `pip install --upgrade --no-cache-dir --no-deps unsloth_zoo`")
# Filter native fd=2 amdgpu.ids noise during early unsloth_zoo import.
with _filter_rocm_amdgpu_ids_fd2_noise():
import unsloth_zoo
import unsloth_zoo
except PackageNotFoundError:
raise ImportError(
f"Unsloth: Please install unsloth_zoo via `pip install unsloth_zoo` then retry!"
Expand All @@ -109,9 +110,7 @@

# Try importing PyTorch and check version
try:
# Filter native fd=2 amdgpu.ids noise during torch import on ROCm.
with _filter_rocm_amdgpu_ids_fd2_noise():
import torch
import torch
except ModuleNotFoundError:
raise ImportError(
"Unsloth: Pytorch is not installed. Go to https://pytorch.org/.\n"
Expand All @@ -120,16 +119,14 @@
except:
raise

# Filter native fd=2 amdgpu.ids noise during early device detection import.
with _filter_rocm_amdgpu_ids_fd2_noise():
from unsloth_zoo.device_type import (
is_hip,
get_device_type,
DEVICE_TYPE,
DEVICE_TYPE_TORCH,
DEVICE_COUNT,
ALLOW_PREQUANTIZED_MODELS,
)
from unsloth_zoo.device_type import (
is_hip,
get_device_type,
DEVICE_TYPE,
DEVICE_TYPE_TORCH,
DEVICE_COUNT,
ALLOW_PREQUANTIZED_MODELS,
)

# Fix other issues
from .import_fixes import (
Expand Down Expand Up @@ -305,10 +302,8 @@ def is_bf16_supported():
# TODO: check triton for intel installed properly.
pass

# Filter native fd=2 amdgpu.ids noise during model import startup.
with _filter_rocm_amdgpu_ids_fd2_noise():
from .models import *
from .models import __version__
from .models import *
from .models import __version__
from .save import *
from .chat_templates import *
from .tokenizer_utils import *
Expand Down
127 changes: 66 additions & 61 deletions unsloth/import_fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import importlib.abc
import importlib.machinery
import importlib.util
import contextlib
from pathlib import Path
from importlib.metadata import version as importlib_version
from packaging.version import Version as TrueVersion
Expand All @@ -26,7 +25,6 @@
import warnings
import sys
import functools
import tempfile

# We cannot do from unsloth_zoo.log import logger since FBGEMM might cause seg faults.
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") in (
Expand Down Expand Up @@ -1191,6 +1189,13 @@ def disable_torchcodec_if_broken():
Path("/dev/kfd"),
Path("/sys/module/amdgpu"),
)
_AMDGPU_ASIC_ID_TABLE_PATH_ENV = "AMDGPU_ASIC_ID_TABLE_PATH"
_AMDGPU_ASIC_ID_CANDIDATE_PATHS = (
Path("/usr/share/libdrm/amdgpu.ids"),
Path("/usr/local/share/libdrm/amdgpu.ids"),
Path("/opt/rocm/share/libdrm/amdgpu.ids"),
Path("/opt/amdgpu/share/libdrm/amdgpu.ids"),
)


def _log_rocm_detection(message):
Expand Down Expand Up @@ -1236,68 +1241,70 @@ def _is_rocm_torch_build() -> bool:
return False


@contextlib.contextmanager
def _filter_stderr_fd(
suppressed_substrings = (_AMDGPU_IDS_MISSING_TEXT,),
):
"""
Capture low-level fd=2 writes, drop only known noisy substrings, and replay
everything else after the protected block.
"""
saved_stderr_fd = None
temp_file = None
redirected = False
def _iter_amdgpu_asic_id_table_candidates():
# Try torch-adjacent ids table paths first without importing torch.
try:
saved_stderr_fd = os.dup(2)
temp_file = tempfile.TemporaryFile(mode = "w+b")
os.dup2(temp_file.fileno(), 2)
redirected = True
torch_spec = importlib.util.find_spec("torch")
except Exception:
redirected = False
torch_spec = None

roots = []
if torch_spec is not None:
if torch_spec.origin:
roots.append(Path(torch_spec.origin).resolve().parent)
if torch_spec.submodule_search_locations:
for location in torch_spec.submodule_search_locations:
roots.append(Path(location).resolve())

seen = set()
for root in roots:
for candidate in (
root / "share" / "libdrm" / "amdgpu.ids",
root.parent / "share" / "libdrm" / "amdgpu.ids",
root.parent.parent / "share" / "libdrm" / "amdgpu.ids",
):
candidate_str = str(candidate)
if candidate_str in seen:
continue
seen.add(candidate_str)
yield candidate

for candidate in _AMDGPU_ASIC_ID_CANDIDATE_PATHS:
candidate_str = str(candidate)
if candidate_str in seen:
continue
seen.add(candidate_str)
yield candidate

try:
yield
finally:
captured = b""
if redirected and temp_file is not None:
try:
temp_file.flush()
temp_file.seek(0)
captured = temp_file.read()
except Exception:
captured = b""
if redirected and saved_stderr_fd is not None:
try:
os.dup2(saved_stderr_fd, 2)
except Exception:
pass
if captured and saved_stderr_fd is not None:
try:
for raw_line in captured.splitlines(keepends = True):
line = raw_line.decode("utf-8", errors = "ignore")
if any(s in line for s in suppressed_substrings):
continue
os.write(saved_stderr_fd, raw_line)
except Exception:
pass
if temp_file is not None:
try:
temp_file.close()
except Exception:
pass
if saved_stderr_fd is not None:
try:
os.close(saved_stderr_fd)
except Exception:
pass

def configure_amdgpu_asic_id_table_path():
# Honor an existing valid user-provided path.
configured = os.environ.get(_AMDGPU_ASIC_ID_TABLE_PATH_ENV, "").strip()
if configured:
configured_path = Path(configured)
try:
if configured_path.is_file():
return str(configured_path)
except Exception:
pass
Comment on lines +1285 to +1289
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The `try ... except Exception: pass` block is overly broad. While it's good to handle potential issues with `configured_path.is_file()`, catching a generic `Exception` can mask other programming errors or unexpected issues. It's generally better to catch more specific exceptions like `OSError` or `PermissionError` that `Path.is_file()` might raise. If other exceptions are truly intended to be ignored, they should be logged for debugging purposes.

Suggested change
try:
if configured_path.is_file():
return str(configured_path)
except Exception:
pass
try:
if configured_path.is_file():
return str(configured_path)
except OSError as e:
logger.debug(f"Unsloth: Error checking configured AMDGPU_ASIC_ID_TABLE_PATH: {e}")


def _filter_rocm_amdgpu_ids_fd2_noise():
# ROCm/libdrm can emit amdgpu.ids missing errors via low-level fd=2 writes.
# Python-level stderr filters cannot intercept those writes.
# Only attempt this on ROCm-like environments.
if not _is_rocm_torch_build():
return contextlib.nullcontext()
return _filter_stderr_fd()
return None

for candidate in _iter_amdgpu_asic_id_table_candidates():
try:
if candidate.is_file():
os.environ[_AMDGPU_ASIC_ID_TABLE_PATH_ENV] = str(candidate)
if UNSLOTH_ENABLE_LOGGING:
logger.info(
f"Unsloth: Set {_AMDGPU_ASIC_ID_TABLE_PATH_ENV}={candidate}"
)
return str(candidate)
except Exception:
continue
Comment on lines +1296 to +1305
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Similar to the previous comment, the `try ... except Exception: continue` block is very broad. Catching a generic `Exception` can hide important issues that might occur during the `candidate.is_file()` check or when setting the environment variable. It's recommended to catch more specific exceptions (e.g., `OSError` for file system operations) or at least log the exception details if the intent is to ignore them and continue.

Suggested change
try:
if candidate.is_file():
os.environ[_AMDGPU_ASIC_ID_TABLE_PATH_ENV] = str(candidate)
if UNSLOTH_ENABLE_LOGGING:
logger.info(
f"Unsloth: Set {_AMDGPU_ASIC_ID_TABLE_PATH_ENV}={candidate}"
)
return str(candidate)
except Exception:
continue
try:
if candidate.is_file():
os.environ[_AMDGPU_ASIC_ID_TABLE_PATH_ENV] = str(candidate)
if UNSLOTH_ENABLE_LOGGING:
logger.info(
f"Unsloth: Set {_AMDGPU_ASIC_ID_TABLE_PATH_ENV}={candidate}"
)
return str(candidate)
except OSError as e:
logger.debug(f"Unsloth: Error checking candidate AMDGPU_ASIC_ID_TABLE_PATH '{candidate}': {e}")
continue


return None


def _is_causal_conv1d_name(module_name: str) -> bool:
Expand Down Expand Up @@ -1432,9 +1439,7 @@ def disable_broken_causal_conv1d():
return

try:
# Suppress only native fd=2 amdgpu.ids noise during causal_conv1d probe.
with _filter_rocm_amdgpu_ids_fd2_noise():
import causal_conv1d # noqa: F401
import causal_conv1d # noqa: F401

return
except Exception as error:
Expand Down