
torchao: add int8; quanto: add NF4; torch compile fixes + ability to compile optim #986


Merged: 30 commits, Sep 28, 2024. The diff below reflects changes from 23 of the 30 commits.

Commits (30)
c38c109 torchao: fp8/autoquant (Sep 24, 2024)
721ce10 update deps (Sep 24, 2024)
0d8b7cf gc_collect should be called before clear on torch and after for mps (Sep 24, 2024)
44c82ff mps: disable gpu quantisation since it does not work (Sep 24, 2024)
6118768 add int8-torchao level for mps support (Sep 24, 2024)
9fcd748 update to use the newer ao api and move cuda restriction to fp8 (Sep 24, 2024)
ed35338 allow training the full model in a quantised state (Sep 24, 2024)
296e3dd return the modified model (Sep 25, 2024)
50f59d6 Merge branch 'main' into feature/torchao (bghira, Sep 26, 2024)
26ee784 torchao: low-precision optims need fp32 gradients (Sep 26, 2024)
de4b563 torchao: cpu optimiser offload, which also does not work (Sep 26, 2024)
979d298 torchao: fix int8 training by monkeypatching the broken method (Sep 27, 2024)
b989ff3 update with int8 nvidia fix (Sep 27, 2024)
cfb6e62 nvidia lock file update (Sep 27, 2024)
a1413a1 fix torch compile validation arg (Sep 28, 2024)
6124933 update error msg (Sep 28, 2024)
a4d3e6b fix int8 again, as we cannot use filter_fn on the whole model (Sep 28, 2024)
4f28ebb remove fp8 and auto (Sep 28, 2024)
8bf7107 update message for loading module (Sep 28, 2024)
f40d67c torchao: rename quantoise -> quantise_model (Sep 28, 2024)
3def7c5 quanto: add nf4 support (Sep 28, 2024)
3aa1925 update optimum-quanto for nf4 support (Sep 28, 2024)
1017e8a update options doc contents, adding quantisation notes (Sep 28, 2024)
f6d770c Update helpers/training/custom_schedule.py (bghira, Sep 28, 2024)
284af19 update quanto fp8 for marlin gemm kernel and auto switch from fp8 to … (Sep 28, 2024)
509a25e reformat files that were missed earlier (Sep 28, 2024)
34ad1fd reorganise options doc (Sep 28, 2024)
e97183c disable cpu offloaded optim (Sep 28, 2024)
a43ff90 dynamo optimisation for flux transformer, always use fp32 rope (Sep 28, 2024)
9bf32b8 remove old optimiser init (Sep 28, 2024)
121 changes: 103 additions & 18 deletions OPTIONS.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions helpers/caching/memory.py
@@ -3,11 +3,11 @@ def reclaim_memory():
     import torch
 
     if torch.cuda.is_available():
+        gc.collect()
         torch.cuda.empty_cache()
         torch.cuda.ipc_collect()
 
     if torch.backends.mps.is_available():
         torch.mps.empty_cache()
         torch.mps.synchronize()
-
-    gc.collect()
+        gc.collect()
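The ordering matters: CUDA's caching allocator can only return blocks whose tensors have already been garbage-collected, so `gc.collect()` runs before `torch.cuda.empty_cache()`, while the commit message notes MPS wants collection after the cache is cleared. A minimal usage sketch (the tensor size and device fallback are illustrative, not part of the PR):

```python
import torch

from helpers.caching.memory import reclaim_memory

device = "cuda" if torch.cuda.is_available() else "cpu"
big = torch.empty(4096, 4096, device=device)  # ~64 MiB of fp32

del big           # drop the last reference so gc.collect() can free it
reclaim_memory()  # gc first, then empty_cache(), so the block leaves the pool
```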
68 changes: 58 additions & 10 deletions helpers/configuration/cmd_args.py
@@ -13,7 +13,7 @@
 from helpers.training import quantised_precision_levels
 from helpers.training.optimizer_param import (
     is_optimizer_deprecated,
-    is_optimizer_bf16,
+    is_optimizer_grad_fp32,
     map_deprecated_optimizer_parameter,
     optimizer_choices,
 )
@@ -1148,6 +1148,39 @@ def get_argument_parser():
             " For example, `--optimizer_config=decouple_lr=True,weight_decay=0.01`."
         ),
     )
+    parser.add_argument(
+        "--optimizer_cpu_offload_method",
+        choices=["none", "torchao"],
+        default="none",
+        help=(
+            "When loading an optimiser, a CPU offload mechanism can be used. Currently, no offload is used by default, and only torchao is supported."
+        ),
+    )
+    parser.add_argument(
+        "--optimizer_offload_gradients",
+        action="store_true",
+        default=False,
+        help=(
+            "When creating a CPU-offloaded optimiser, the gradients can be offloaded to the CPU to save more memory."
+        ),
+    )
+    parser.add_argument(
+        "--fuse_optimizer",
+        action="store_true",
+        default=False,
+        help=(
+            "When creating a CPU-offloaded optimiser, the fused optimiser can be used to save memory, while running slightly slower."
+        ),
+    )
+    parser.add_argument(
+        "--optimizer_torch_compile",
+        action="store_true",
+        default=False,
+        help=(
+            "When using a CPU-offloaded optimiser, we can torch.compile() it and save some time using a compiled graph."
+            " This option will not work on Apple MPS devices, and may not work on all systems."
+        ),
+    )
     parser.add_argument(
         "--optimizer_beta1",
         type=float,
@@ -1282,8 +1315,8 @@
     )
     parser.add_argument(
         "--validation_torch_compile",
-        type=str,
-        default="false",
+        action="store_true",
+        default=False,
         help=(
             "Supply `--validation_torch_compile` to enable the use of torch.compile() on the validation pipeline."
             " For some setups, torch.compile() may error out. This is dependent on PyTorch version, phase of the moon,"
@@ -1984,6 +2017,21 @@ def parse_cmdline_args(input_args=None):
         raise ValueError(
             f"Model is not using bf16 precision, but the optimizer {chosen_optimizer} requires it."
         )
+    if is_optimizer_grad_fp32(args.optimizer):
+        print(
+            "[WARNING] Using a low-precision optimizer that requires fp32 gradients. Training will run more slowly."
+        )
+        if args.gradient_precision != "fp32":
+            print(
+                f"[WARNING] Overriding gradient_precision to 'fp32' for {args.optimizer} optimizer."
+            )
+            args.gradient_precision = "fp32"
+    else:
+        if args.gradient_precision == "fp32":
+            print(
+                f"[WARNING] Overriding gradient_precision to 'unmodified' for {args.optimizer} optimizer, as fp32 gradients are not required."
+            )
+            args.gradient_precision = "unmodified"
 
     if torch.backends.mps.is_available():
         if (
@@ -2001,6 +2049,12 @@
             )
             sys.exit(1)
 
+        if args.quantize_via == "accelerator":
+            error_log(
+                "MPS does not benefit from models being quantized on the accelerator device. Overriding --quantize_via to 'cpu'."
+            )
+            args.quantize_via = "cpu"
+
     if (
         args.max_train_steps is not None
         and args.max_train_steps > 0
@@ -2091,10 +2145,6 @@
 
     if args.metadata_update_interval < 60:
         raise ValueError("Metadata update interval must be at least 60 seconds.")
-    if args.validation_torch_compile == "true":
-        args.validation_torch_compile = True
-    else:
-        args.validation_torch_compile = False
 
     if args.model_family == "sd3":
         args.pretrained_vae_model_name_or_path = None
@@ -2247,9 +2297,7 @@
     )
     args.disable_accelerator = os.environ.get("SIMPLETUNER_DISABLE_ACCELERATOR", False)
 
-    if "lora" not in args.model_type:
-        args.base_model_precision = "no_change"
-    elif "lycoris" == args.lora_type.lower():
+    if "lycoris" == args.lora_type.lower():
         from lycoris import create_lycoris
 
         if args.lycoris_config is None:
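To see how the new optimiser flags behave without invoking SimpleTuner itself, here is a self-contained argparse mirror of the four options added above (an illustration only, not the project's actual parser); note that `--validation_torch_compile` is now a presence-only `store_true` flag in the same way:

```python
import argparse

# Mirror of the four options added in this PR, for experimentation.
parser = argparse.ArgumentParser()
parser.add_argument("--optimizer_cpu_offload_method", choices=["none", "torchao"], default="none")
parser.add_argument("--optimizer_offload_gradients", action="store_true", default=False)
parser.add_argument("--fuse_optimizer", action="store_true", default=False)
parser.add_argument("--optimizer_torch_compile", action="store_true", default=False)

args = parser.parse_args(
    ["--optimizer_cpu_offload_method=torchao", "--optimizer_offload_gradients"]
)
assert args.optimizer_cpu_offload_method == "torchao"
assert args.optimizer_offload_gradients is True
assert args.fuse_optimizer is False
```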
4 changes: 4 additions & 0 deletions helpers/training/__init__.py
@@ -3,9 +3,13 @@
     # "fp4-bnb",
     # "fp8-bnb",
     "fp8-quanto",
+    "nf4-quanto",
     "int8-quanto",
     "int4-quanto",
     "int2-quanto",
+    # currently does not work.
+    # "fp8-torchao",
+    "int8-torchao",
 ]
 
 image_file_extensions = set(["jpg", "jpeg", "png", "webp", "bmp", "tiff", "tif"])
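As a rough sketch of what the quanto level strings correspond to at runtime, the mapping and helper below are illustrative (they are not SimpleTuner's `quantise_model`); the newly added `nf4-quanto` level is omitted because it needs the optimum-quanto release referenced in this PR:

```python
import torch
from optimum.quanto import freeze, qfloat8, qint2, qint4, qint8, quantize

# Illustrative mapping from level strings to quanto qtypes.
QUANTO_LEVELS = {
    "fp8-quanto": qfloat8,
    "int8-quanto": qint8,
    "int4-quanto": qint4,
    "int2-quanto": qint2,
}

def quantise_with_quanto(model: torch.nn.Module, level: str) -> torch.nn.Module:
    quantize(model, weights=QUANTO_LEVELS[level])  # swap weights for qtensors
    freeze(model)                                  # materialise quantised weights
    return model

model = quantise_with_quanto(torch.nn.Sequential(torch.nn.Linear(8, 8)), "int8-quanto")
```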
1 change: 1 addition & 0 deletions helpers/training/custom_schedule.py
@@ -156,6 +156,7 @@ def get_polynomial_decay_schedule_with_warmup(
 
     """
 
+    print(f"Optimizer: {optimizer}")
     lr_init = optimizer.defaults["lr"]
     if not (float(lr_init) > float(lr_end)):
         raise ValueError(
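For context, the guard after the new print enforces that `lr_end` stays strictly below the optimiser's initial learning rate. A hedged usage sketch, assuming the scheduler keeps the transformers-style signature:

```python
import torch

from helpers.training.custom_schedule import get_polynomial_decay_schedule_with_warmup

model = torch.nn.Linear(4, 4)
optim = torch.optim.AdamW(model.parameters(), lr=1e-4)

# lr_end must be strictly below lr (1e-4 here), or the ValueError above fires.
sched = get_polynomial_decay_schedule_with_warmup(
    optim, num_warmup_steps=100, num_training_steps=1000, lr_end=1e-6
)
```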
91 changes: 91 additions & 0 deletions helpers/training/optimizer_param.py
@@ -17,6 +17,23 @@
 except:
     pass
 
+try:
+    from torchao.prototype.low_bit_optim import (
+        AdamW8bit as AOAdamW8Bit,
+        Adam4bit as AOAdamW4Bit,
+        AdamFp8 as AOAdamFp8,
+        AdamWFp8 as AOAdamWFp8,
+        CPUOffloadOptimizer as AOCPUOffloadOptimizer,
+    )
+
+    if torch.backends.mps.is_available():
+        import torch._dynamo
+
+        torch._dynamo.config.suppress_errors = True
+except Exception as e:
+    print("You need torchao installed for its low-precision optimizers.")
+    raise e
+
 try:
     import optimi
 
@@ -36,6 +53,46 @@
         },
         "class": AdamWBF16,
     },
+    "ao-adamw8bit": {
+        "gradient_precision": "bf16",
+        "precision": "any",
+        "default_settings": {
+            "betas": (0.9, 0.999),
+            "weight_decay": 1e-2,
+            "eps": 1e-6,
+        },
+        "class": AOAdamW8Bit,
+    },
+    "ao-adamw4bit": {
+        "gradient_precision": "bf16",
+        "precision": "any",
+        "default_settings": {
+            "betas": (0.9, 0.999),
+            "weight_decay": 1e-2,
+            "eps": 1e-6,
+        },
+        "class": AOAdamW4Bit,
+    },
+    "ao-adamfp8": {
+        "gradient_precision": "bf16",
+        "precision": "any",
+        "default_settings": {
+            "betas": (0.9, 0.999),
+            "weight_decay": 1e-2,
+            "eps": 1e-6,
+        },
+        "class": AOAdamFp8,
+    },
+    "ao-adamwfp8": {
+        "gradient_precision": "bf16",
+        "precision": "any",
+        "default_settings": {
+            "betas": (0.9, 0.999),
+            "weight_decay": 1e-2,
+            "eps": 1e-6,
+        },
+        "class": AOAdamWFp8,
+    },
     "adamw_schedulefree": {
         "precision": "any",
         "override_lr_scheduler": True,
@@ -276,6 +333,40 @@ def is_optimizer_bf16(optimizer: str) -> bool:
     return False
 
 
+def is_optimizer_grad_fp32(optimizer: str) -> bool:
+    optimizer_precision = optimizer_choices.get(optimizer, {}).get(
+        "gradient_precision", None
+    )
+    if optimizer_precision == "fp32":
+        return True
+    return False
+
+
+def cpu_offload_optimizer(
+    params_to_optimize,
+    optimizer_cls,
+    optimizer_parameters: dict,
+    offload_gradients: bool = True,
+    fused: bool = True,
+    offload_mechanism: str = None,
+):
+    if not offload_mechanism or offload_mechanism == "none":
+        return optimizer_cls(params_to_optimize, **optimizer_parameters)
+    if offload_mechanism != "torchao":
+        raise ValueError(
+            f"Unknown CPU optimiser offload mechanism: {offload_mechanism}"
+        )
+
+    if offload_gradients:
+        optimizer_parameters["offload_gradients"] = offload_gradients
+    if fused:
+        optimizer_parameters["fused"] = fused
+
+    optimizer_parameters["optimizer_class"] = optimizer_cls
+
+    return AOCPUOffloadOptimizer(params_to_optimize, **optimizer_parameters)
+
+
 def determine_optimizer_class_with_config(
     args, use_deepspeed_optimizer, is_quantized, enable_adamw_bf16
 ) -> tuple:
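A usage sketch of `cpu_offload_optimizer` from the hunk above. The model is a stand-in; torchao's `CPUOffloadOptimizer` keeps optimiser state on the CPU and runs the step there, and `fused=True` is forwarded to the inner optimiser, so plain `torch.optim.AdamW` (whose fused CPU path needs a recent PyTorch) is used here rather than a low-bit class:

```python
import torch

from helpers.training.optimizer_param import cpu_offload_optimizer

# Stand-in model; torchao's CPU offload expects CUDA parameters in practice.
model = torch.nn.Linear(64, 64, device="cuda" if torch.cuda.is_available() else "cpu")

optim = cpu_offload_optimizer(
    params_to_optimize=list(model.parameters()),
    optimizer_cls=torch.optim.AdamW,  # inner optimiser; its step runs on CPU
    optimizer_parameters={"lr": 1e-4, "weight_decay": 1e-2},
    offload_gradients=True,  # stream gradients to CPU as they are produced
    fused=True,              # forwarded to AdamW; needs a recent PyTorch
    offload_mechanism="torchao",
)

loss = model(torch.randn(8, 64, device=model.weight.device)).sum()
loss.backward()
optim.step()
optim.zero_grad()
```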