Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ fast_tests:
fast_tests_diffusers:
python -m pip install .[tests]
python -m pip install -r examples/stable-diffusion/requirements.txt
python -m pip install peft==0.16.0
python -m pytest tests/test_diffusers.py

# Run single-card non-regression tests on image classification models
Expand Down Expand Up @@ -86,7 +87,7 @@ slow_tests_custom_file_input: test_installs
slow_tests_1x: test_installs
@status1=0; status2=0; status3=0; \
python -m pytest tests/test_examples.py -v -s -k "single_card" || status1=$$?; \
python -m pip install peft==0.10.0; \
python -m pip install peft==0.12.0; \
python -m pytest tests/test_peft_inference.py || status2=$$?; \
python -m pytest tests/test_pipeline.py || status3=$$?; \
exit $$((status1 + status2 + status3))
Expand Down
15 changes: 10 additions & 5 deletions examples/stable-diffusion/text_to_image_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,19 +576,24 @@ def main():
pipeline.unet.set_default_attn_processor(pipeline.unet)

if args.unet_adapter_name_or_path is not None:
from peft import PeftModel
from peft import PeftModel, tuners

tuners.boft.layer._FBD_CUDA = False

pipeline.unet = PeftModel.from_pretrained(pipeline.unet, args.unet_adapter_name_or_path)
pipeline.unet = pipeline.unet.merge_and_unload()
with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=args.bf16):
pipeline.unet = pipeline.unet.merge_and_unload()
Comment thread
regisss marked this conversation as resolved.

if args.text_encoder_adapter_name_or_path is not None:
from peft import PeftModel
from peft import PeftModel, tuners

tuners.boft.layer._FBD_CUDA = False

pipeline.text_encoder = PeftModel.from_pretrained(
pipeline.text_encoder, args.text_encoder_adapter_name_or_path
)
pipeline.text_encoder = pipeline.text_encoder.merge_and_unload()

with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=args.bf16):
pipeline.text_encoder = pipeline.text_encoder.merge_and_unload()
else:
# SD LDM3D use-case
from optimum.habana.diffusers import GaudiStableDiffusionLDM3DPipeline as GaudiStableDiffusionPipeline
Expand Down
10 changes: 7 additions & 3 deletions examples/stable-diffusion/training/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ PT_HPU_LAZY_MODE=1 python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dr
--mixed_precision=bf16 \
--use_hpu_graphs_for_training \
--use_hpu_graphs_for_inference \
--sdp_on_bf16 \
--gaudi_config_name Habana/stable-diffusion \
full
```
Expand All @@ -257,7 +258,7 @@ generate any additional images needed to meet the `num_class_images` requirement

### PEFT Model Fine-Tuning

We provide DreamBooth examples demonstrating how to use LoRA, LoKR, LoHA, and OFT adapters to fine-tune the
We provide DreamBooth examples demonstrating how to use LoRA, LoKR, LoHA, OFT and BOFT adapters to fine-tune the
UNet or text encoder.

To run the multi-card training, use:
Expand All @@ -283,6 +284,7 @@ PT_HPU_LAZY_MODE=1 python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dr
--mixed_precision=bf16 \
--use_hpu_graphs_for_training \
--use_hpu_graphs_for_inference \
--sdp_on_bf16 \
--gaudi_config_name Habana/stable-diffusion \
lora --unet_r 8 --unet_alpha 8
```
Expand All @@ -291,7 +293,7 @@ PT_HPU_LAZY_MODE=1 python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dr
> When using PEFT method we can use a much higher learning rate compared to vanilla dreambooth.
> Here we use `1e-4` instead of the usual `5e-6`

Similar command could be applied with `loha`, `lokr`, or `oft` adapters.
A similar command can be used with the `loha`, `lokr`, `oft`, or `boft` adapters.

You could check each adapter's specific arguments with `--help`, for example:

Expand All @@ -300,7 +302,8 @@ python train_dreambooth.py oft --help
```

> [!WARNING]
> Currently, the `oft` adapter is not supported in HPU graph mode, as it triggers `torch.inverse`,
> Currently, the `oft` and `boft` adapters are not supported in HPU graph mode, as they trigger `torch.inverse` or `torch.linalg.solve`,
> causing a CPU fallback that is incompatible with HPU graph capturing.

After training completes, you can use `text_to_image_generation.py` sample for inference as follows:
Expand Down Expand Up @@ -346,6 +349,7 @@ PT_HPU_LAZY_MODE=1 python train_dreambooth_lora_sdxl.py \
--seed=0 \
--use_hpu_graphs_for_inference \
--use_hpu_graphs_for_training \
--sdp_on_bf16 \
--gaudi_config_name Habana/stable-diffusion
```

Expand Down
3 changes: 2 additions & 1 deletion examples/stable-diffusion/training/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@ compel
datasets
imagesize
opencv-python
peft==0.10.0
peft==0.16.0
sentencepiece
tensorboard==2.19.0
84 changes: 81 additions & 3 deletions examples/stable-diffusion/training/train_dreambooth.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
from diffusers.utils.torch_utils import is_compiled_module
from habana_frameworks.torch.hpu import memory_stats
from huggingface_hub import HfApi
from peft import LoHaConfig, LoKrConfig, LoraConfig, OFTConfig, get_peft_model
from peft import BOFTConfig, LoHaConfig, LoKrConfig, LoraConfig, OFTConfig, get_peft_model, tuners
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
Expand Down Expand Up @@ -108,7 +108,9 @@ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: st
raise ValueError(f"{model_class} is not supported.")


def create_unet_adapter_config(args: argparse.Namespace) -> Union[LoraConfig, LoHaConfig, LoKrConfig, OFTConfig]:
def create_unet_adapter_config(
args: argparse.Namespace,
) -> Union[LoraConfig, LoHaConfig, LoKrConfig, OFTConfig, BOFTConfig]:
if args.adapter == "full":
raise ValueError("Cannot create unet adapter config for full parameter")

Expand Down Expand Up @@ -151,7 +153,22 @@ def create_unet_adapter_config(args: argparse.Namespace) -> Union[LoraConfig, Lo
init_weights=True,
coft=args.unet_use_coft,
eps=args.unet_eps,
oft_block_size=0,
)
elif args.adapter == "boft":
config = BOFTConfig(
boft_block_size=args.unet_block_size,
boft_block_num=args.unet_block_num,
boft_n_butterfly_factor=args.unet_n_butterfly_factor,
target_modules=UNET_TARGET_MODULES,
boft_dropout=args.unet_dropout,
bias=args.unet_bias,
)
from optimum.habana.peft.layer import GaudiBoftLinearForward

tuners.boft.layer.Linear.forward = GaudiBoftLinearForward
tuners.boft.layer._FBD_CUDA = False

else:
raise ValueError(f"Unknown adapter type {args.adapter}")

Expand All @@ -160,7 +177,7 @@ def create_unet_adapter_config(args: argparse.Namespace) -> Union[LoraConfig, Lo

def create_text_encoder_adapter_config(
args: argparse.Namespace,
) -> Union[LoraConfig, LoHaConfig, LoKrConfig, OFTConfig]:
) -> Union[LoraConfig, LoHaConfig, LoKrConfig, OFTConfig, BOFTConfig]:
if args.adapter == "full":
raise ValueError("Cannot create text_encoder adapter config for full parameter")

Expand Down Expand Up @@ -201,7 +218,21 @@ def create_text_encoder_adapter_config(
init_weights=True,
coft=args.te_use_coft,
eps=args.te_eps,
oft_block_size=0,
)
elif args.adapter == "boft":
config = BOFTConfig(
boft_block_size=args.te_block_size,
boft_block_num=args.te_block_num,
boft_n_butterfly_factor=args.te_n_butterfly_factor,
target_modules=TEXT_ENCODER_TARGET_MODULES,
boft_dropout=args.te_dropout,
bias=args.te_bias,
)
from optimum.habana.peft.layer import GaudiBoftLinearForward

tuners.boft.layer.Linear.forward = GaudiBoftLinearForward
tuners.boft.layer._FBD_CUDA = False
else:
raise ValueError(f"Unknown adapter type {args.adapter}")

Expand Down Expand Up @@ -479,6 +510,12 @@ def parse_args(input_args=None):
action="store_true",
help="Use HPU graphs for inference on HPU.",
)
parser.add_argument(
"--sdp_on_bf16",
action="store_true",
default=False,
help="Allow pyTorch to use reduced precision in the SDPA math backend",
)

# Adapter arguments
subparsers = parser.add_subparsers(dest="adapter")
Expand Down Expand Up @@ -632,6 +669,44 @@ def parse_args(input_args=None):
help="The control strength of COFT for text_encoder, only used if `train_text_encoder` is True",
)

# boft adapter
boft = subparsers.add_parser("boft", help="Use Boft adapter")
boft.add_argument("--unet_block_size", type=int, default=8, help="Boft block_size for unet")
boft.add_argument("--unet_block_num", type=int, default=0, help="Boft block_num for unet")
boft.add_argument("--unet_n_butterfly_factor", type=int, default=1, help="Boft n_butterfly_factor for unet")
boft.add_argument("--unet_dropout", type=float, default=0.1, help="Boft dropout for unet")
boft.add_argument("--unet_bias", type=str, default="boft_only", help="Boft bias for unet")
boft.add_argument(
"--te_block_size",
type=int,
default=8,
help="Boft block_size for text_encoder,only used if `train_text_encoder` is True",
)
boft.add_argument(
"--te_block_num",
type=int,
default=0,
help="Boft block_num for text_encoder,only used if `train_text_encoder` is True",
)
boft.add_argument(
"--te_n_butterfly_factor",
type=int,
default=1,
help="Boft n_butterfly_factor for text_encoder,only used if `train_text_encoder` is True",
)
boft.add_argument(
"--te_dropout",
type=float,
default=0.1,
help="Boft dropout for text_encoder,only used if `train_text_encoder` is True",
)
boft.add_argument(
"--te_bias",
type=str,
default="boft_only",
help="Boft bias for text_encoder, only used if `train_text_encoder` is True",
)

if input_args is not None:
args = parser.parse_args(input_args)
else:
Expand Down Expand Up @@ -875,6 +950,9 @@ def main(args):
if args.seed is not None:
set_seed(args.seed)

if args.sdp_on_bf16:
torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)

# Generate class images if prior preservation is enabled.
if args.with_prior_preservation:
class_images_dir = Path(args.class_data_dir)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,12 @@ def parse_args(input_args=None):
action="store_true",
help="Use HPU graphs for inference on HPU.",
)
parser.add_argument(
"--sdp_on_bf16",
action="store_true",
default=False,
help="Allow pyTorch to use reduced precision in the SDPA math backend",
)

if input_args is not None:
args = parser.parse_args(input_args)
Expand Down Expand Up @@ -851,6 +857,9 @@ def main(args):
if args.seed is not None:
set_seed(args.seed)

if args.sdp_on_bf16:
torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)

# Generate class images if prior preservation is enabled.
if args.with_prior_preservation:
class_images_dir = Path(args.class_data_dir)
Expand Down
1 change: 1 addition & 0 deletions optimum/habana/peft/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
GaudiAdaloraLayerSVDLinearForward,
GaudiAdaptedAttention_getattr,
GaudiAdaptedAttentionPreAttnForward,
GaudiBoftLinearForward,
GaudiPolyLayerLinearForward,
)
from .peft_model import gaudi_generate, gaudi_prepare_inputs_for_generation
66 changes: 66 additions & 0 deletions optimum/habana/peft/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,69 @@ def GaudiAdaptedAttention_getattr(self, name: str):
# This is necessary as e.g. causal models have various methods that we
# don't want to re-implement here.
return getattr(self.model, name)


def GaudiBoftLinearForward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
    """
    Forward pass for a BOFT-wrapped linear layer, intended as a replacement for
    ``Linear.forward`` from
    https://github.com/huggingface/peft/blob/v0.16.0/src/peft/tuners/boft/layer.py#L591

    Compared to the upstream implementation, only the dtype-casting logic is changed
    to avoid an error on HPU.

    Args:
        x: Input tensor; its dtype is also the dtype of the returned tensor.
        *args: Extra positional arguments forwarded to the base layer when the
            adapters are disabled or already merged.
        **kwargs: Extra keyword arguments forwarded to the base layer in the same cases.

    Returns:
        The layer output, cast back to the dtype ``x`` had on entry.
    """
    input_dtype = x.dtype

    # Adapters switched off: make sure no merged adapter weights remain, then
    # defer entirely to the wrapped layer.
    if self.disable_adapters:
        if self.merged:
            self.unmerge()
        return self.base_layer(x, *args, **kwargs).to(input_dtype)

    # Adapters already merged into the base weights: the wrapped layer is enough.
    if self.merged:
        return self.base_layer(x, *args, **kwargs).to(input_dtype)

    # Start from the identity rotation and unit scale, then compose every active adapter.
    rotation = torch.eye(self.in_features, device=x.device, dtype=input_dtype)
    scale = torch.ones((int(self.out_features), 1), device=x.device, dtype=input_dtype)

    for adapter_name in self.active_adapters:
        if adapter_name not in self.boft_R:
            continue

        adapter_R = self.boft_R[adapter_name]
        adapter_s = self.boft_s[adapter_name]
        adapter_dropout = self.boft_dropout[adapter_name]

        # Flatten the two leading dims for the batched Cayley transform, then restore them.
        N, D, H, _ = adapter_R.shape
        orth_blocks = self.cayley_batch(adapter_R.view(N * D, H, H)).view(N, D, H, H)
        orth_blocks = adapter_dropout(orth_blocks).squeeze(0)
        block_diag = torch.block_diag(*torch.unbind(orth_blocks)).unsqueeze(0)

        # Per the upstream BOFT authors, cayley_batch, dropout and FastBlockDiag only
        # return fp32 outputs, so both operands are moved to x's dtype/device here.
        perm = self.boft_P.to(x)
        block_diag = block_diag.to(x)
        factors = torch.bmm(perm, torch.bmm(block_diag, perm.permute(0, 2, 1)))

        # Left-multiply the butterfly factors in order: factors[-1] @ ... @ factors[0].
        composed = factors[0]
        for idx in range(1, factors.shape[0]):
            composed = factors[idx] @ composed

        rotation = composed @ rotation
        scale = adapter_s * scale

    base_weight = self.get_base_layer().weight.data
    x = x.to(base_weight.dtype)

    # Rotate the transposed base weight in the input dtype, then transpose back.
    weight_t = base_weight.transpose(0, 1).to(input_dtype)
    rotated = torch.mm(rotation.to(input_dtype), weight_t).transpose(0, 1)

    final_weight = (rotated * scale).to(input_dtype)
    bias = self._cast_input_dtype(self.base_layer.bias, final_weight.dtype)
    output = F.linear(input=x, weight=final_weight, bias=bias)

    return output.to(input_dtype)
Loading
Loading