Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 87 additions & 3 deletions unsloth_zoo/saving_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,15 +792,99 @@ def _merge_and_overwrite_lora(
tensors[key] = resized[key]
else:
tensors[key] = f.get_tensor(key)
save_file(tensors, filename_original)
del tensors

# POSIX: direct save. Windows: temp-file + os.replace to
# avoid WinError 1224 (mmap section release can lag).
if os.name != "nt":
save_file(tensors, filename_original)
del tensors
else:
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()

max_retries = 5
base_delay = 0.2 # seconds
temp_dir = os.path.dirname(os.path.abspath(filename_original))
try:
original_mode = os.stat(filename_original).st_mode
except OSError:
Comment thread
github-code-quality[bot] marked this conversation as resolved.
Fixed
original_mode = None

fd, tmp_path = tempfile.mkstemp(dir=temp_dir, suffix=".safetensors.tmp")
os.close(fd)

try:
save_file(tensors, tmp_path)
if original_mode is not None:
try:
os.chmod(tmp_path, original_mode)
except OSError:
Comment thread
github-code-quality[bot] marked this conversation as resolved.
Fixed
pass

# Drop mmap refs before os.replace
del tensors
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()

for attempt in range(max_retries):
try:
os.replace(tmp_path, filename_original)
tmp_path = None
break
except OSError as e:
winerror = getattr(e, "winerror", None)
error_msg = str(e).lower()
is_lock_error = (
winerror in {32, 1224}
or (
winerror == 5 and (
"user-mapped" in error_msg
or "being used by another process" in error_msg
or "sharing violation" in error_msg
)
)
or "user-mapped" in error_msg
or "being used by another process" in error_msg
)
if is_lock_error and attempt < max_retries - 1:
delay = base_delay * (2 ** attempt)
if UNSLOTH_ENABLE_LOGGING:
logger.warning(
f"[Retry {attempt + 1}/{max_retries}] Windows file lock "
f"detected for {filename_original}: {e}. "
f"Waiting {delay:.1f}s before retry..."
)
gc.collect()
time.sleep(delay)
continue
if is_lock_error:
raise RuntimeError(
f"Failed to rewrite {filename_original} after {max_retries} "
f"attempts due to Windows file lock. Original shard is intact "
f"(atomic replace never committed). "
f"Solutions: 1) Restart Unsloth Studio 2) Disable antivirus "
f"3) Close File Explorer windows"
) from e
raise RuntimeError(
f"Model merge failed while rewriting {filename_original}: {e}"
) from e
finally:
if tmp_path is not None and os.path.exists(tmp_path):
try:
os.remove(tmp_path)
except OSError:
Comment thread
github-code-quality[bot] marked this conversation as resolved.
Fixed
pass

if torch.cuda.is_available():
torch.cuda.empty_cache()
return count, safetensor_keys_seen

except RuntimeError:
raise
except Exception as e:
raise RuntimeError(f"Model merge failed with error: {e}")
raise RuntimeError(f"Model merge failed with error: {e}") from e

finally:
# Cleanup memory mapping
Expand Down
Loading