diff --git a/unsloth_zoo/saving_utils.py b/unsloth_zoo/saving_utils.py
index 4800306ff..791dd9316 100644
--- a/unsloth_zoo/saving_utils.py
+++ b/unsloth_zoo/saving_utils.py
@@ -792,7 +792,94 @@ def _merge_and_overwrite_lora(
                     tensors[key] = resized[key]
                 else:
                     tensors[key] = f.get_tensor(key)
-        save_file(tensors, filename_original)
+
+        # Fix for Windows file locking (os error 1224).
+        # The shard is written to a temp file in the same directory and then
+        # swapped into place with os.replace, so the original shard is never
+        # removed before its replacement exists. Lock errors are retried.
+        import tempfile
+        import time
+
+        # Import safetensors exception to catch wrapped Windows errors
+        try:
+            from safetensors.torch import SafetensorError
+        except ImportError:
+            SafetensorError = Exception  # Fallback if not available
+
+        max_retries = 10
+        base_delay = 0.2  # seconds
+        # Temp file lives next to the target so the final swap stays on one volume
+        temp_dir = os.path.dirname(filename_original) or "."
+        # Windows error codes treated as transient locks:
+        # 5 = ERROR_ACCESS_DENIED, 32 = ERROR_SHARING_VIOLATION,
+        # 1224 = ERROR_USER_MAPPED_FILE.
+        # NOTE(review): 5 and 32 are presumably what os.replace reports while the
+        # target is still mapped or open elsewhere - confirm on Windows.
+        lock_winerrors = (5, 32, 1224)
+        lock_markers = ("1224", "user-mapped", "cannot be performed")
+
+        for attempt in range(max_retries):
+            # Force garbage collection and CUDA cache cleanup
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+            tmp_path = None
+            try:
+                # Reserve a unique temp path; the handle is closed on leaving the
+                # with-block so save_file can open the path itself
+                with tempfile.NamedTemporaryFile(
+                    delete=False,
+                    dir=temp_dir,
+                    suffix=".safetensors.tmp",
+                ) as tmp_file:
+                    tmp_path = tmp_file.name
+
+                # Write to temp file (safe - original untouched)
+                save_file(tensors, tmp_path)
+
+                # Swap the temp file over the original in a single step; if this
+                # fails the original shard is still in place
+                os.replace(tmp_path, filename_original)
+                tmp_path = None  # Nothing left to clean up
+                break  # Success
+
+            except (OSError, SafetensorError) as e:
+                # Catch both OS errors and safetensors-wrapped Windows errors
+                error_msg = str(e).lower()
+                is_lock_error = (
+                    getattr(e, "winerror", None) in lock_winerrors
+                    or any(marker in error_msg for marker in lock_markers)
+                )
+
+                if not is_lock_error:
+                    # Non-lock errors - fail immediately
+                    raise RuntimeError(f"Model merge failed with error: {e}") from e
+
+                if attempt == max_retries - 1:
+                    raise RuntimeError(
+                        f"Failed to save file after {max_retries} attempts due to Windows file lock. "
+                        "Original shard preserved - no data loss. "
+                        "Solutions: 1) Restart Unsloth Studio 2) Disable antivirus 3) Close File Explorer windows"
+                    ) from e
+
+                # Exponential backoff for lock errors (delay doubles every two attempts)
+                delay = base_delay * (2 ** (attempt // 2))
+                if UNSLOTH_ENABLE_LOGGING:
+                    logger.warning(
+                        f"[Retry {attempt + 1}/{max_retries}] Windows file lock detected: {e}. "
+                        f"Waiting {delay:.1f}s before retry..."
+                    )
+                time.sleep(delay)
+
+            finally:
+                # Never leave a partial temp shard behind, whatever went wrong
+                if tmp_path is not None:
+                    try:
+                        os.remove(tmp_path)
+                    except OSError:
+                        pass
+
         del tensors
         if torch.cuda.is_available():